Search code examples
compiler-construction bison flex-lexer yacc lex

Why does Bison (Yacc) print new lines for apparently no reason?


I am trying to build a compiler for a simple formatting language using Flex and Bison. I am still at the beginning and I have coded some functionality.

At this stage though, I have still not printed anything to yyout anywhere. I have some error cases where something is printed in the output file but that clearly doesn't happen with this input. All my other print statements will print to the console. So, I expect that the output file will be totally empty. However, when I try to use the following as my input file:

\begin {document}

\tabsize( 5)
\title{"Why I Love Compiler Design"}
\author{"COMP421 Student"}
\date{29/12/2016}
\pagesetup{30,100 }

\end{document}

The output file generated is:

enter image description here

There are 9 empty lines, corresponding to the 9 lines I had in my input file. The output I expect however is only 1 empty line.

This is my .l file:

%{
    /* C prologue of the scanner: copied verbatim into the generated lexer. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    /* Token codes (BSLASH, INTEGER, ...) and yylval, as used by the rules below */
    #include "y.tab.h"
    /* Error reporter and scanner entry point */
    void yyerror(const char *);
    int yylex(void);
    /* Output stream; main() below points it at the output file */
    extern FILE *yyout;
    /* Parser entry point, called from main() below */
    extern int  yyparse();
%}

/* Allows printing the line number (of an error) */
%option yylineno

%%

    /* Structural tokens. Note the use of \ in these regex expressions to escape the following symbols: \, {, }, (, ) */
^\\ { printf("LEX returned token BSLASH\n"); return BSLASH; } /* every backslash has to be at the start of a line */
\{  { printf("LEX returned token LBRACE\n"); return LBRACE; }
\}  { printf("LEX returned token RBRACE\n"); return RBRACE; }
\(  { printf("LEX returned token LPAREN\n"); return LPAREN; }
\)  { printf("LEX returned token RPAREN\n"); return RPAREN; }
,   { printf("LEX returned token COMMA\n"); return COMMA; }

    /* Keywords that delimit the document */
begin    { printf("LEX returned token BEGIN_\n"); return BEGIN_; }
end      { printf("LEX returned token END\n"); return END; }
document { printf("LEX returned token DOCUMENT\n"); return DOCUMENT; }

    /* Keywords naming the five document properties */
pagesetup { printf("LEX returned token PAGESETUP\n"); return PAGESETUP; }
tabsize   { printf("LEX returned token TABSIZE\n"); return TABSIZE; }
title     { printf("LEX returned token TITLE\n"); return TITLE; }
author    { printf("LEX returned token AUTHOR\n"); return AUTHOR; }
date      { printf("LEX returned token DATE\n"); return DATE; }

    /* A dd mm yyyy date with an optional '-', '/' or ' ' between the parts.
       Listed before INTEGER so that the date rule wins when both match the
       same number of characters. */
    /* NOTE(review): sValue is set to yytext itself, not a copy; flex reuses
       that buffer for later tokens, so copy the text if the parser needs it
       after the next call to yylex(). The same applies to STRING below. */
(((0[1-9]|[12][0-9]|30)[-/ ]?(0[13-9]|1[012])|31[-/ ]?(0[13578]|1[02])|(0[1-9]|1[0-9]|2[0-8])[-/ ]?02)[-/ ]?[0-9]{4}|29[-/ ]?02[-/ ]?([0-9]{2}(([2468][048]|[02468][48])|[13579][26])|([13579][26]|[02468][048]|0[0-9]|1[0-6])00))  { printf("LEX returned token DDMMYYYYDATE\n"); yylval.sValue = yytext; return DDMMYYYYDATE; }
    /* A run of digits containing at least one non-zero digit */
[0-9]*[1-9][0-9]*   { printf("LEX returned token INTEGER\n"); yylval.iValue = atoi(yytext); return INTEGER; }
    /* A double-quoted string on one line; ".*" is greedy, so it runs to the last quote on the line */
\".*\"              { printf("LEX returned token STRING\n"); yylval.sValue = yytext; return STRING; }

    /* skip whitespace which is not part of a string */
[ \t] ;

    /* anything else is an error */
    /* NOTE(review): '.' does not match '\n' and no other rule does either, so
       newlines fall through to flex's default rule, which copies unmatched
       input to yyout. */
. yyerror("invalid character");

%%

/*
 * Entry point: scans and parses the file named by argv[1]; yyout is pointed
 * at the file named by argv[2].
 *
 * Returns EXIT_SUCCESS when both files could be opened, yyparse() reported
 * success and the output file was closed cleanly; EXIT_FAILURE otherwise.
 */
int main(int argc, char *argv[]) {
    if (argc != 3) {
        yyerror("ERROR You need 2 args: inputFileName outputFileName");
        return EXIT_FAILURE;
    }

    yyin = fopen(argv[1], "r");
    if (yyin == NULL) {
        perror(argv[1]);
        return EXIT_FAILURE;
    }

    yyout = fopen(argv[2], "w");
    if (yyout == NULL) {
        perror(argv[2]);
        fclose(yyin);
        return EXIT_FAILURE;
    }

    /* yyparse() returns 0 only when the whole input was accepted. */
    int status = (yyparse() == 0) ? EXIT_SUCCESS : EXIT_FAILURE;

    fclose(yyin);
    /* fclose() flushes buffered output, so a failure here means lost data. */
    if (fclose(yyout) != 0) {
        perror(argv[2]);
        status = EXIT_FAILURE;
    }

    return status;
}

This is my .y file:

%{
    /* C prologue of the parser: copied verbatim into the generated parser. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include "y.tab.h"
    /* Error reporter (defined after the grammar) and scanner entry point */
    void yyerror(const char *);
    int yylex(void);

    /* Output stream and current input line number; both are defined outside this file */
    extern FILE *yyout;
    extern int  yylineno;

    /* An array with counters of how many times each of the 5 document properties appears in the input file. The order of the properties is defined in the enum below */
    int docPropertyCounters[5];

    /* The 5 document properties; each value doubles as an index into docPropertyCounters */
    typedef enum {
        PAGE_SETUP,
        TAB_SIZE,
        DOC_TITLE,
        DOC_AUTHOR,
        DOC_DATE
    } document_property;

    /* Maps a document property to the way it is written in the input language, for use in messages. */
    static inline char *stringFromDocPropertyEnum(document_property indexOfProperty) {
        /* Keyed by enum constant so the table cannot drift out of order with the enum */
        static char *propertyText[] = {
            [PAGE_SETUP] = "\\pagesetup{}",
            [TAB_SIZE]   = "\\tabsize()",
            [DOC_TITLE]  = "\\title{}",
            [DOC_AUTHOR] = "\\author{}",
            [DOC_DATE]   = "\\date{}"
        };

        return propertyText[indexOfProperty];
    }
%}

/* Semantic value carried by a token: an int for INTEGER, a C string for DDMMYYYYDATE and STRING */
%union { 
    int iValue;      /* integer value */ 
    char* sValue;      /* C-String */ 
}; 

%start file /* defining the start symbol of the grammar */

/* Punctuation */
%token BSLASH LBRACE RBRACE LPAREN RPAREN COMMA

%token BEGIN_ END DOCUMENT /* BEGIN is already taken (flex defines a BEGIN macro) so BEGIN_ was used instead */

/* Keywords naming the document properties */
%token PAGESETUP TABSIZE TITLE AUTHOR DATE

/* Tokens that carry a semantic value */
%token <iValue> INTEGER

%token <sValue> DDMMYYYYDATE STRING

%%

/* A whole input file: either empty, or \begin{document}, any number of
   document properties, then \end{document}. */
file: beginDocument docProperties endDocument
            {
                /* Every document property must appear exactly once: report each
                   one that is missing or repeated. yyerror() is not used here
                   because the line number does not need to be shown. Each
                   message ends with a newline so that several diagnostics do
                   not run together on one line. */
                const size_t propertyCount = sizeof(docPropertyCounters)/sizeof(docPropertyCounters[0]);

                for (size_t i = 0; i < propertyCount; i++) {
                    if (docPropertyCounters[i] < 1) {
                        fprintf(stderr, "SYNTAX ERROR: Your source file does not contain the required document property %s\n", stringFromDocPropertyEnum((document_property)i));
                    } else if (docPropertyCounters[i] > 1) {
                        fprintf(stderr, "SYNTAX ERROR: Your source file contains more than one instance of the document property %s\n", stringFromDocPropertyEnum((document_property)i));
                    }
                }
            }
          | /* An empty document is parsed to an empty document, no errors generated*/
          ;

/* \begin{document} */
beginDocument: BSLASH BEGIN_ LBRACE DOCUMENT RBRACE;

/* Zero or more document properties, in any order (left-recursive list) */
docProperties: docProperties docProperty
               | /* empty */
               ;                

    /* required properties... there should be one instance of each in the input file */ 
    /* Each alternative bumps the counter for its property; the counters are checked in the action of the `file` rule */
docProperty:    pageSetupProperty { docPropertyCounters[PAGE_SETUP]++; }
                | tabSizeProperty { docPropertyCounters[TAB_SIZE]++; }
                | titleProperty   { docPropertyCounters[DOC_TITLE]++; }
                | authorProperty  { docPropertyCounters[DOC_AUTHOR]++; }
                | dateProperty    { docPropertyCounters[DOC_DATE]++; }
                ;   

/* \pagesetup{INTEGER,INTEGER} */
pageSetupProperty: BSLASH PAGESETUP LBRACE INTEGER COMMA INTEGER RBRACE;

/* \tabsize(INTEGER) -- parentheses, unlike the other properties */
tabSizeProperty: BSLASH TABSIZE LPAREN INTEGER RPAREN;

/* \title{STRING} */
titleProperty: BSLASH TITLE LBRACE STRING RBRACE;

/* \author{STRING} */
authorProperty: BSLASH AUTHOR LBRACE STRING RBRACE;

/* \date{DDMMYYYYDATE} */
dateProperty: BSLASH DATE LBRACE DDMMYYYYDATE RBRACE;

/* \end{document} */
endDocument: BSLASH END LBRACE DOCUMENT RBRACE;

%%

/* End-of-input hook for the scanner: always returns 1, i.e. there is no further input to switch to. */
int yywrap(void) {
    const int noMoreInput = 1;

    return noMoreInput;
}

/* Writes the message `str` to stderr, prefixed with the current value of yylineno. */
void yyerror(const char* str)
{
    const int currentLine = yylineno;

    fprintf(stderr, "SYNTAX ERROR near line [%d]: %s\n", currentLine, str);
}

PS: I am using Windows 10 and quite an old version of flex (2.5.4a).


Solution

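  • These empty lines are the line endings of your input file (a line feed \n, or carriage return plus line feed \r\n): you have not put them into the whitespace pattern and no other rule matches \n (the `.` pattern does not match a newline), so flex's default rule copies each one to yyout.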

    Perhaps you should have:

    [ \t\r\n]      ;
    

    You should also be careful about using C style comments in the specification. Sometimes these are treated as patterns. I always advise students to only put C style comments in actual C code. For example, it is better to do this:

    [ \t\r\n]      ;  /* skip whitespace which is not part of a string */
    

    and never put comments elsewhere. Others may disagree, but I find it avoids an awful lot of grief in flex and bison.

    PS: I haven't tested my suggestion on your code....