Search code examples
ccompiler-constructiongrammarbisonflex-lexer

Getting Syntax Error in flex/bison with this grammar


I am trying to generate middle code for a compiler. I am using flex-bison on macOS. When I try an input I get syntax error and I do not know where that syntax error came from.

This is my lex:

%{
#include<stdbool.h>
#include <stdlib.h>
#include <string.h>
//#include "LinkedList.h"
#include "parser.tab.h"
char* tempString;
%}
%option yylineno

whole   0|[1-9][0-9]*
real    {whole}\.(([0-9])|([0-9][0-9]*[1-9]))
divisionop [/]
noSymbol    [^ \t\n\/\+\-\*\:\;\,\(\)\.]
error   {noSymbol}+

%%


[ \t\n\r]+  ;



\#[-+]?{whole}/[\ :\,\;\+\-\/\*\)\n\r]|".GT."|".GE."|".EQ."|".NE."|".LT."|".LE."|":="   {

    char* string = (char*) malloc(strlen(yytext));
    strcpy(string, yytext + 1);
    string[strlen(string)] = 0;
    //printf("Found integer : %d\n", atoi(string));
    //push(string,INTEGER,yylineno);
    return INTEGER;}


\#[-+]?{real}/[\ :\,\;\+\-\/\*\)\n\r]|".GT."|".GE."|".EQ."|".NE."|".LT."|".LE."|":="    {
    char* string = (char*) malloc(strlen(yytext));
    strcpy(string, yytext + 1);
    string[strlen(string)] = 0;
    //printf("Found real:%s\n",string);
    //push(string,REAL,0);
    return REALNUM;
}

[a-zA-Z][0-9][0-9a-zA-Z]*   {
    //printf("Found identifier:%s\n",yytext);
    //yylval.str = yytext;
    //printf("****lvalue is:%s\n",yylval.str);
    //push(yytext,IDENTIFIER,yylineno);
    tempString = (char*) malloc(strlen(yytext));
    strcpy(tempString, yytext);
    return IDENTIFIER;
}

Program {
    //printf("Found program :%s\n",yytext);
    return PROGRAM;

}

Int {
    //printf("Found int :%s\n",yytext);
    return INT;

}

Real    {
    //printf("Found real :%s\n",yytext);
    return REAL;

}

Bool    {
    //printf("Found bool :%s\n",yytext);
    return BOOL;

}

Procedure   {
    //printf("Found real :%s\n",yytext);
    return PROCEDURE;

}

Function    {
    //printf("Found function :%s\n",yytext);
    return FUNCTION;

}

Begin   {
    //printf("Found begin :%s\n",yytext);
    return BEGINN;

}

End {
    //printf("Found end :%s\n",yytext);
    return END;

}

If  {
    //printf("Found if :%s\n",yytext);
    return IF;
}

Then {
    //printf("Found then :%s\n",yytext);
    return THEN;

}

Else {
    //printf("Found else :%s\n",yytext);
    return ELSE;
}

While {
    //printf("Found while :%s\n",yytext);
    return WHILE;
}

Do {
    //printf("Found do :%s\n",yytext);
    return DO;
}

For {
    //printf("Found for :%s\n",yytext);
    return FOR;
}

To {
    //printf("Found to :%s\n",yytext);
    return TO;
}

Downto {
    //printf("Found downto :%s\n",yytext);
    return DOWNTO;
}

Case {
    //printf("Found case :%s\n",yytext);
    return CASE;
}

Return {
    //printf("Found return :%s\n",yytext);
    return RETURN;
}

"And Then" {
    //printf("Found andthen :%s\n",yytext);
    //push(yytext,ANDTHEN,yylineno);
    return ANDTHEN;
}

"Or Else" {
    //printf("Found andthen :%s\n",yytext);
    //push(yytext,ORELSE,yylineno);
    return ORELSE;
}

"+" {
    //printf("Found plus :%s\n",yytext);
    return PLUS;
}

\-  {
    //printf("Found minus :%s\n",yytext);
    return MINUS;

}

\*  {
    //printf("Found multiply :%s\n",yytext);
    return MULTIPLY;
}

{divisionop}    {
    //printf("Found division :%s\n",yytext);
    //push(yytext,DIVISION,yylineno);
    return DIVISION;
}

".GT."  {
    //printf("Found greaterthan :%s\n",yytext);
    return GREATERTHAN;
}

".GE."  {
    //printf("Found greaterequal :%s\n",yytext);
    return GREATEREQUAL;
}

\.NE\.  {
    //printf("Found notequal :%s\n",yytext);
    return NOTEQUAL;
}

\.EQ\.  {
    //printf("Found EQUAL :%s\n",yytext);
    return EQUAL;
}

\.LT\.  {
    //printf("Found lessthan :%s\n",yytext);
    return LESSTHAN;
}

\.LE\.  {
    //printf("Found lessequal :%s\n",yytext);
    return LESSEQUAL;
}

[,] {
    //printf("Found comma :%s\n",yytext);
    return COMMA;
}

":="    {
    //printf("Found declare :%s\n",yytext);
    return DECLARE;
}

[:] {
    //printf("Found semicolon :%s\n",yytext);
    return COLON;
}

[;] {
    //printf("Found semicolon :%s\n",yytext);
    return SEMICOLON;
}


\( {
    //printf("Found oppar :%s\n",yytext);

    return OPPAR;

}

\)  {
    //printf("Found cppar :%s\n",yytext);
    return CPAR;
}

False   {
    //printf("Found false :%s\n",yytext);
    return FALSE;
}

True    {
    //printf("Found true :%s\n",yytext);
    return TRUE;
}

{error} {
    printf("Error on : **<<  %s  >>**\n", yytext);
    //yymore();
}
\.  {
    printf("Error on : **<<  illegal use of \" %s \" >>**\n", yytext);
    //yymore();
}

%%


int yywrap(){
    return 1;
}

//int main(){
//  yyin = fopen("input2P.txt", "r");
//  initLinkedList(table);
//  while(yylex());
//  printLinkedList();
//  return 0;
//}

this is my parser.y

%{
#include<stdio.h>
#include "stack.h"
//#include "LinkedList.h"
//#include "symbol_table.h"

int currentType;
char* returnType;

extern FILE* yyin;
extern char* tempString;
struct Stack* tblptrStack;

char* new_temp(char *c) {
    string name("t");
    name += to_string(num);
    num++;
    char *what = (char *) malloc(sizeof(char) * 100);
    strcpy(what, name.c_str());
    symbol_table_insert(what, c);
    strcpy(what, symbol_table_lookup(what).id.c_str());
    return what;
}


int yylex();
void yyerror(char* error);
struct SymbolTable* secondArg(struct SymbolTable* a,struct SymbolTable* b );
%}

%union{
  char *str;
   struct {
        int quad;
        int is_boolean;
        char *place;
        char *code;
        char *type;
   } eval;
}

}

%start program
%token INTEGER
%token ZERO
%token REALNUM
%token PROGRAM
%token INT
%token REAL
%token BOOL
%token PROCEDURE
%token FUNCTION
%token BEGINN
%token END
%token IF
%token THEN
%token ELSE
%token WHILE
%token DO
%token FOR
%token TO
%token DOWNTO
%token CASE
%token RETURN
%token ANDTHEN
%token ORELSE
%token IDENTIFIER
%token PLUS
%token MINUS
%token MULTIPLY
%token DIVISION
%token GREATERTHAN
%token GREATEREQUAL
%token NOTEQUAL
%token EQUAL
%token LESSTHAN
%token LESSEQUAL
%token COMMA
%token SEMICOLON
%token COLON
%token DECLARE
%token OPPAR
%token CPAR
%token FALSE
%token TRUE
%token ERROR
%type <eval> exp

%left COMMA
%left INT BOOL REAL

%left COLON
%left IF_PREC   
%left ELSE
%left ANDTHEN ORELSE
%left PLUS MINUS
%left MULTIPLY DIVISION
%left GREATERTHAN GREATEREQUAL NOTEQUAL EQUAL LESSTHAN LESSEQUAL

%%

program:
    PROGRAM IDENTIFIER M SEMICOLON declist block SEMICOLON {
        printf("\nin program\n"); allSymbolTablePrint( pop(tblptrStack) ); 
    }
    | PROGRAM IDENTIFIER M SEMICOLON block SEMICOLON {
        printf("\nin program\n"); allSymbolTablePrint( pop(tblptrStack) ); 
    }
    ;
M:
    { 
        struct SymbolTable* t = mkTable( NULL , "program");
        push(t, tblptrStack);
    }
;
declist:
    dec
    | declist dec 
    ;
dec : 
    vardec {printf("var deccc");}
    | procdec
    | funcdec
    ;
type : 
    INT {
        currentType=4;
        returnType = (char*) malloc(strlen("INT"));
        strcpy(returnType, "INT");
        printf("this is int");
    }
    | REAL {
        currentType=8;
        returnType = (char*) malloc(strlen("REAL"));
        strcpy(returnType, "REAL");
    }
    |BOOL {
        currentType=1;
        returnType = (char*) malloc(strlen("BOOL"));
        strcpy(returnType, "BOOL");
    }
    ;
iddec : 
    IDENTIFIER { enterVar( tempString, currentType , 0 , top(tblptrStack) ); printf("a variable entered:%s\n",tempString); }
    | IDENTIFIER { enterVar( tempString, currentType , 0 , top(tblptrStack) ); printf("a variable entered:%s\n",tempString); } DECLARE {
    printf("this is declare eeee");
    } exp 
    ;
idlist : 
    iddec {printf("this is idlist iddec");}
    | idlist COMMA iddec
    ;
vardec :
    type idlist SEMICOLON 
    {printf("vardec");}
    ;
procdec :
    PROCEDURE IDENTIFIER NP OPPAR paramdecs CPAR declist block SEMICOLON {
        char* tmpc = top(tblptrStack)->name;
        struct SymbolTable* tmpt = pop(tblptrStack);
        enterProcFunc( tmpc , 0 , "NULL" ,  tmpt , top(tblptrStack) ); 
    }
    | PROCEDURE IDENTIFIER NP OPPAR paramdecs CPAR block SEMICOLON { 
        char* tmpc = top(tblptrStack)->name;
        struct SymbolTable* tmpt = pop(tblptrStack);
        enterProcFunc( tmpc , 0 , "NULL" ,  tmpt , top(tblptrStack) ); 
    }
    ;
NP:
    {
        printf("inside NP\t tempString:%s\n",tempString);
        struct SymbolTable* t = mkTable( top(tblptrStack) , tempString );
        push(t, tblptrStack);
    }
    ;
funcdec :
    FUNCTION IDENTIFIER FN OPPAR paramdecs CPAR COLON type {
        char* tmpc = top(tblptrStack)->name;
        struct SymbolTable* tmpt = pop(tblptrStack);
        enterProcFunc(tmpc , -1 , returnType ,  tmpt , top(tblptrStack) );
    } declist block SEMICOLON
    | FUNCTION IDENTIFIER FN OPPAR paramdecs CPAR COLON type {
        char* tmpc = top(tblptrStack)->name;
        struct SymbolTable* tmpt = pop(tblptrStack);
        enterProcFunc(tmpc , -1 , returnType ,  tmpt , top(tblptrStack) );
    } block SEMICOLON
    ;
FN:
    {
        printf("inside FN\t tempString:%s\n",tempString);
        struct SymbolTable* t = mkTable( top(tblptrStack) , tempString );
        push(t, tblptrStack);
    }
    ;
paramdecs :
    paramdec { printf("paramdecs"); }
    | paramdecs SEMICOLON paramdec { printf("paramdecs\n"); }
    | { printf("paramdecs\n"); }
    ;
paramdec : 
    type paramlist
    ;
paramlist :
    IDENTIFIER 
    | paramlist COMMA IDENTIFIER
    ;
block : 
    BEGINN stmtlist END
    | stmt 
    ;
stmtlist : 
    stmt 
    | stmtlist SEMICOLON stmt 
    ;
lvalue : 
    IDENTIFIER
    ;
stmt :
    lvalue DECLARE exp 
    | IF exp THEN block %prec IF_PREC
    | IF exp THEN block ELSE block %prec ELSE
    | WHILE exp DO block 
    | FOR lvalue DECLARE exp TO exp DO block 
    | FOR lvalue DECLARE exp DOWNTO exp DO block 
    | CASE exp caseelement END 
    | RETURN exp 
    | exp {printf(stmt exp);}
    ;
exp : 
    exp ANDTHEN exp 
    | exp ORELSE exp  
    | exp PLUS exp   {
        printf("Rule  \t\t mathlogicExpression -> mathlogicExpression KW_PLUS mathlogicExpression\n");
        $$.place = new_temp($1.type);
        $$.type = $1.type;
        printf($1.place, $3.place, "+", $$.place);
    };
    | exp MINUS exp 
    | exp MULTIPLY exp { printf(" RULE FOR MULTIPLY\n");}
    | exp DIVISION exp 
    | OPPAR exp CPAR 
    | boolexp relop boolexp
    | INTEGER {printf("this is integerrrr");}
    | REALNUM 
    | TRUE
    | FALSE 
    | lvalue 
    | IDENTIFIER OPPAR explist CPAR 
    ;
boolexp:
    OPPAR exp CPAR 
    | INTEGER 
    | REALNUM 
    | TRUE
    | FALSE 
    | lvalue 
    | IDENTIFIER OPPAR explist CPAR 
    ;
caseelement : 
    INTEGER COLON block SEMICOLON
    | caseelement INTEGER COLON block SEMICOLON
    ;
explist :
    exp 
    | explist COMMA exp  
    |
    ;
relop :
    GREATERTHAN
    | GREATEREQUAL
    | NOTEQUAL
    | EQUAL
    | LESSTHAN
    | LESSEQUAL
    ;


%%

struct SymbolTable* popTop(struct Stack* b ){
    pop(b);
    top(b);
}

int main(){
    //printf("hi1");
    tblptrStack=createStack();
    tblptrStack->top = NULL;
    //printf("hi2");
    yyin = fopen("input3.txt", "r");
    //printf("hi3");
    yyparse();
    return 0;
}

void yyerror(char* error){
    printf("Error : %s\n", error);
} 

I tried this input for example:

Program p1Main;

    Int i1, i2:=#-23;   
Begin
    i1 := i1 + i2;

End
;

and I get this error

a variable entered:i1
a variable entered:i2
Error : syntax error
logout
Saving session...
...copying shared history...
...saving history...truncating history files...
...completed.

I have tried to put printf s to find out where this syntax error comes from but I did not get any results. I have seen the other questions like this and tried their solution but it didn't work. Please help me if you know how to solve this.


Solution

  • You are getting a syntax error because your input program is syntactically incorrect and the parser is working properly.

    The (trimmed) grammar shows a program as:

    program:    PROGRAM IDENTIFIER M SEMICOLON declist block SEMICOLON
    

    and it shows a block as:

    block : 
        BEGINN stmtlist END
        | stmt 
        ;
    

    and subsequently a statement list is:

    stmtlist : 
        stmt 
        | stmtlist SEMICOLON stmt 
        ;
    

    Here, you will see that a SEMICOLON is a statement separator and not a terminator. This means there can never be a SEMICOLON before an END. However you did that:

     Begin
        i1 := i1 + i2;
    
    End
    

    So you have to decide which is the correct interpretation of the language; your grammar or your input program. Beginners make this mistake so often it takes an experienced programmer or teacher a few seconds to spot. How could you have used bison to learn this? I'll add that detail following ...


    How could you have found this on your own? Well, following the rules of Stack Overflow would have been a good start. You have not reduced your code example to the minimum required to explain your problem. You just dumped THE WHOLE THING into your question and just gave up and handed it over to us.

    How could a good student have simplified this problem. One thing you could have done is simplified your test input to the parser and tried simpler and simpler input programs. For example, ask yourself "does the error occur in the declarations or the block?". "What happens if I use a simpler program with no declarations?". Using this form of deduction you might be able to deduce that the fault lies somewhere in the BLOCK. You could have tried examples that matched the different rules in BLOCK and statement list to see which alternatives worked and which did not and eventually find the rule that caused problems.

    You could then have created a much smaller grammar, lexer and test program to paste into Stack Overflow which explained the specific problem with the specific grammar rule that was failing and ask for help at that point. However, usually, in the process of creating the smaller example the problem and solution reveals itself.

    A more powerful and programmatic way of finding what caused the issue is to use the debug features built into bison. Bison has a test mode which is enabled by running bison with the -t argument and also setting the yydebug variable at run time. This will produce a detailed trace of the parse which will show you exactly which input symbol and rule caused the fault. I'll show you what that produces in the next installment following ...


    OK. I've now, after some issues with your pasted code produced the desired output. You have incorrectly pasted invalid code into the question. There is an extra "}" on line 41 of parser.y which I had to delete and we do not have your stack.h. I had to remove all your semantic actions to show a grammar debug. Anyway, after doing that and running with bison debug enabled we get the following (on my Mac):

    briantompsett:~ cssbct$ gcc parser.tab.c lex.yy.c -ll -o parser
    briantompsett:~ cssbct$ ./parser
    Starting parse
    Entering state 0
    Reading a token: Program p1Main;
    
        Int i1, i2:=#-23;   
    Begin
        i1 := i1 + i2;
    
    End
    ;Next token is token PROGRAM ()
    Shifting token PROGRAM ()
    Entering state 1
    Reading a token: Next token is token IDENTIFIER ()
    Shifting token IDENTIFIER ()
    Entering state 3
    Reducing stack by rule 3 (line 108):
    -> $$ = nterm M ()
    Stack now 0 1 3
    Entering state 5
    Reading a token: Next token is token SEMICOLON ()
    Shifting token SEMICOLON ()
    Entering state 6
    Reading a token: Next token is token INT ()
    Shifting token INT ()
    Entering state 9
    Reducing stack by rule 9 (line 122):
       $1 = token INT ()
    -> $$ = nterm type ()
    Stack now 0 1 3 5 6
    Entering state 26
    Reading a token: Next token is token IDENTIFIER ()
    Shifting token IDENTIFIER ()
    Entering state 50
    Reading a token: Next token is token COMMA ()
    Reducing stack by rule 12 (line 133):
       $1 = token IDENTIFIER ()
    -> $$ = nterm iddec ()
    Stack now 0 1 3 5 6 26
    Entering state 51
    Reducing stack by rule 15 (line 139):
       $1 = nterm iddec ()
    -> $$ = nterm idlist ()
    Stack now 0 1 3 5 6 26
    Entering state 52
    Next token is token COMMA ()
    Shifting token COMMA ()
    Entering state 82
    Reading a token: Next token is token IDENTIFIER ()
    Shifting token IDENTIFIER ()
    Entering state 50
    Reading a token: Next token is token DECLARE ()
    Shifting token DECLARE ()
    Entering state 81
    Reducing stack by rule 13 (line 134):
    -> $$ = nterm @1 ()
    Stack now 0 1 3 5 6 26 52 82 50 81
    Entering state 110
    Reading a token: Next token is token INTEGER ()
    Shifting token INTEGER ()
    Entering state 7
    Reading a token: Next token is token SEMICOLON ()
    Reducing stack by rule 52 (line 206):
       $1 = token INTEGER ()
    -> $$ = nterm exp ()
    Stack now 0 1 3 5 6 26 52 82 50 81 110
    Entering state 124
    Next token is token SEMICOLON ()
    Reducing stack by rule 14 (line 134):
       $1 = token IDENTIFIER ()
       $2 = token DECLARE ()
       $3 = nterm @1 ()
       $4 = nterm exp ()
    -> $$ = nterm iddec ()
    Stack now 0 1 3 5 6 26 52 82
    Entering state 111
    Reducing stack by rule 16 (line 140):
       $1 = nterm idlist ()
       $2 = token COMMA ()
       $3 = nterm iddec ()
    -> $$ = nterm idlist ()
    Stack now 0 1 3 5 6 26
    Entering state 52
    Next token is token SEMICOLON ()
    Shifting token SEMICOLON ()
    Entering state 83
    Reducing stack by rule 17 (line 143):
       $1 = nterm type ()
       $2 = nterm idlist ()
       $3 = token SEMICOLON ()
    -> $$ = nterm vardec ()
    Stack now 0 1 3 5 6
    Entering state 27
    Reducing stack by rule 6 (line 117):
       $1 = nterm vardec ()
    -> $$ = nterm dec ()
    Stack now 0 1 3 5 6
    Entering state 25
    Reducing stack by rule 4 (line 113):
       $1 = nterm dec ()
    -> $$ = nterm declist ()
    Stack now 0 1 3 5 6
    Entering state 24
    Reading a token: this is idlist iddecthis is declare eeeethis is integerrrrvardecvar decccNext token is token BEGINN ()
    Shifting token BEGINN ()
    Entering state 14
    Reading a token: Next token is token IDENTIFIER ()
    Shifting token IDENTIFIER ()
    Entering state 20
    Reading a token: Next token is token DECLARE ()
    Reducing stack by rule 34 (line 184):
       $1 = token IDENTIFIER ()
    -> $$ = nterm lvalue ()
    Stack now 0 1 3 5 6 24 14
    Entering state 31
    Next token is token DECLARE ()
    Shifting token DECLARE ()
    Entering state 54
    Reading a token: Next token is token IDENTIFIER ()
    Shifting token IDENTIFIER ()
    Entering state 20
    Reading a token: Next token is token PLUS ()
    Reducing stack by rule 34 (line 184):
       $1 = token IDENTIFIER ()
    -> $$ = nterm lvalue ()
    Stack now 0 1 3 5 6 24 14 31 54
    Entering state 39
    Next token is token PLUS ()
    Reducing stack by rule 56 (line 210):
       $1 = nterm lvalue ()
    -> $$ = nterm exp ()
    Stack now 0 1 3 5 6 24 14 31 54
    Entering state 84
    Next token is token PLUS ()
    Shifting token PLUS ()
    Entering state 57
    Reading a token: Next token is token IDENTIFIER ()
    Shifting token IDENTIFIER ()
    Entering state 20
    Reading a token: Next token is token SEMICOLON ()
    Reducing stack by rule 34 (line 184):
       $1 = token IDENTIFIER ()
    -> $$ = nterm lvalue ()
    Stack now 0 1 3 5 6 24 14 31 54 84 57
    Entering state 39
    Next token is token SEMICOLON ()
    Reducing stack by rule 56 (line 210):
       $1 = nterm lvalue ()
    -> $$ = nterm exp ()
    Stack now 0 1 3 5 6 24 14 31 54 84 57
    Entering state 87
    Next token is token SEMICOLON ()
    Reducing stack by rule 46 (line 200):
       $1 = nterm exp ()
       $2 = token PLUS ()
       $3 = nterm exp ()
    -> $$ = nterm exp ()
    Stack now 0 1 3 5 6 24 14 31 54
    Entering state 84
    Next token is token SEMICOLON ()
    Reducing stack by rule 35 (line 187):
       $1 = nterm lvalue ()
       $2 = token DECLARE ()
       $3 = nterm exp ()
    -> $$ = nterm stmt ()
    Stack now 0 1 3 5 6 24 14
    Entering state 38
    Reducing stack by rule 32 (line 180):
       $1 = nterm stmt ()
    -> $$ = nterm stmtlist ()
    Stack now 0 1 3 5 6 24 14
    Entering state 37
    Next token is token SEMICOLON ()
    Shifting token SEMICOLON ()
    Entering state 71
    Reading a token: Next token is token END ()
    Error : syntax error
    Error: popping token SEMICOLON ()
    Stack now 0 1 3 5 6 24 14 37
    Error: popping nterm stmtlist ()
    Stack now 0 1 3 5 6 24 14
    Error: popping token BEGINN ()
    Stack now 0 1 3 5 6 24
    Error: popping nterm declist ()
    Stack now 0 1 3 5 6
    Error: popping token SEMICOLON ()
    Stack now 0 1 3 5
    Error: popping nterm M ()
    Stack now 0 1 3
    Error: popping token IDENTIFIER ()
    Stack now 0 1
    Error: popping token PROGRAM ()
    Stack now 0
    Cleanup: discarding lookahead token END ()
    Stack now 0
    

    You'll see that now it stopped at the SEMICOLON before the END, as shown by this part of the trace:

    Next token is token SEMICOLON ()
    Shifting token SEMICOLON ()
    Entering state 71
    Reading a token: Next token is token END ()
    Error : syntax error
    Error: popping token SEMICOLON ()
    

    That's how you should debug in the future.