compiler-construction bison flex-lexer yacc lex

LEX + YACC taking token in next line for a rule

I'm trying to build a parser for my own mini language, which is then translated into C++ by YACC itself.

The problem is that YACC reads the first line of input plus the first token of the second line and matches them all against a single rule, whereas it should read only the tokens on the first line and match those against the corresponding rule.

My input file is:

print "hello"
a = 10
print a

Lex file:

%{
    /*
     * Scanner for the mini language. Every rule below either discards its
     * match or returns a token code taken from y.tab.h.
     *
     * - Spaces and tabs are dropped. A newline only increments yylineno and
     *   returns nothing, so the parser never sees where a line ends.
     *   ("[ \n]" also lists a space, but the "[ \t]" rule above it is
     *   listed first and therefore wins for a single space.)
     * - "print" is listed before the identifier rule, so the exact word
     *   "print" is returned as PRINT rather than ID.
     * - PRINT, ID, INTEGER and STRING store a strdup()'d copy of the
     *   matched text in yylval; ASSIGN sets no value.
     *
     * NOTE(review): strdup() is declared in <string.h>, which is not
     * included here -- confirm and add it.
     * NOTE(review): yylval is assigned a char *, but nothing visible here
     * defines YYSTYPE as char *; presumably it is still the default int --
     * verify against y.tab.h.
     * NOTE(review): \".*\" is greedy, so two string literals on one line
     * would be returned as a single STRING -- confirm that is intended.
     */
    #include <stdio.h>
    #include "y.tab.h"  
%}

alpha   [a-zA-Z]
digit   [0-9]

%%
[ \t]                           ;
[ \n]                           { yylineno = yylineno + 1;}
print                           {yylval = strdup(yytext); return PRINT;}
{alpha}({alpha}|{digit})*       {yylval = strdup(yytext); return ID;}
{digit}+                        {yylval = strdup(yytext); return INTEGER;}
\".*\"                          {yylval = strdup(yytext); return STRING;}
"="                             return ASSIGN;
%%

YACC file is:

%{
    /*
     * Parser prologue: standard headers used by the actions and main(),
     * plus the scanner globals that main() and yyerror() read or assign.
     *
     * NOTE(review): the actions treat semantic values as char *, but no
     * YYSTYPE definition or %union is visible here -- confirm the value
     * type is really a pointer type.
     */
    #include <stdio.h>
    #include <string.h>
    #include <stdlib.h>
    extern int yylineno;
    extern FILE *yyin;
    extern FILE *yyout;
    extern char *yytext; 
%}

/* Of these, only PRINT, ASSIGN and INTEGER appear in the rules shown below. */
%token PRINT INPUT INTO ASSIGN INTEGER DECIMAL BOOLVAL CHARACTER

/*
 * Declares STRING and ID as tokens with precedence levels; the later line
 * (ID) has the higher precedence. This does not stop "PRINT STRING ID" from
 * being matched: when the parser has seen PRINT STRING and the lookahead is
 * ID, the higher-precedence ID is shifted instead of reducing "PRINT STRING".
 */
%nonassoc STRING
%nonassoc ID
%%
/*
 * Grammar and translation actions.
 *
 * Every `print` / `assign` action builds its C++ statement in a freshly
 * malloc()'d buffer sized for the exact text. Before an action runs, yacc
 * presets $$ to $1, so writing through $$ without allocating would
 * overwrite the small strdup()'d token buffer behind $1.
 *
 * Ownership: token values are strdup()'d by the scanner; each action frees
 * the values it consumes, and `entry` frees the statement after writing it.
 * ASSIGN carries no value of its own, so its $2 is never freed.
 *
 * NOTE(review): assumes YYSTYPE is char * -- no definition is visible here.
 */

/* entry: one or more actions; each statement is written to yyout after a tab. */
entry:  entry action    {fprintf(yyout, "\t%s", $2); free($2); $$ = 0; }
    | action            {fprintf(yyout, "\t%s", $1); free($1); $$ = 0; }
    ;

action : print          {$$ = $1;}
    | assign            {$$ = $1;}
    ;

print : PRINT ID    {
            /* Emits: cout<<ID<<endl; */
            printf("rule: PRINT ID");
            size_t cap = strlen($2) + sizeof "cout<<<<endl;\n";
            $$ = malloc(cap);
            if (!$$) { fprintf(stderr, "out of memory\n"); YYABORT; }
            snprintf($$, cap, "cout<<%s<<endl;\n", $2);
            free($1);
            free($2);
        }
    | PRINT STRING  {
            /* Emits: cout<<STRING<<endl; */
            printf("rule: PRINT STRING\n");
            size_t cap = strlen($2) + sizeof "cout<<<<endl;\n";
            $$ = malloc(cap);
            if (!$$) { fprintf(stderr, "out of memory\n"); YYABORT; }
            snprintf($$, cap, "cout<<%s<<endl;\n", $2);
            free($1);
            free($2);
        }
    | PRINT STRING ID   {
            /* Emits: cout<<STRING<<ID<<endl; */
            printf("rule: PRINT STRING ID\n");
            size_t cap = strlen($2) + strlen($3) + sizeof "cout<<<<<<endl;\n";
            $$ = malloc(cap);
            if (!$$) { fprintf(stderr, "out of memory\n"); YYABORT; }
            snprintf($$, cap, "cout<<%s<<%s<<endl;\n", $2, $3);
            free($1);
            free($2);
            free($3);
        }
    ;

assign: ID ASSIGN INTEGER {
            /* Emits: int ID = INTEGER; */
            size_t cap = strlen($1) + strlen($3) + sizeof "int  = ;\n";
            char *stmt = malloc(cap);
            if (!stmt) { fprintf(stderr, "out of memory\n"); YYABORT; }
            snprintf(stmt, cap, "int %s = %s;\n", $1, $3);
            free($1);
            free($3);
            $$ = stmt;
        }
    ;
%%

/*
 * Entry point: parses the file named by argv[1] and writes the generated
 * C++ text to "out.txt".
 *
 * Prints "Parsing complete" or "Parsing failed" depending on yyparse().
 * Returns EXIT_FAILURE if the input argument is missing, either file cannot
 * be opened, or the output file cannot be flushed and closed; otherwise
 * returns 0, including when parsing fails.
 */
int main(int argc, char *argv[])
{
    if (argc < 2) {
        fprintf(stderr, "usage: %s <input-file>\n",
                argc > 0 ? argv[0] : "parser");
        return EXIT_FAILURE;
    }

    yyin = fopen(argv[1], "r");
    if (yyin == NULL) {
        perror(argv[1]);
        return EXIT_FAILURE;
    }

    yyout = fopen("out.txt", "w");
    if (yyout == NULL) {
        perror("out.txt");
        fclose(yyin);
        return EXIT_FAILURE;
    }

    if (!yyparse())
        printf("\nParsing complete\n");
    else
        printf("\nParsing failed\n");

    fclose(yyin);

    /* fclose() flushes buffered output, so a failure here means lost data. */
    if (fclose(yyout) != 0) {
        perror("out.txt");
        return EXIT_FAILURE;
    }
    return 0;
}

/*
 * Error callback for the parser: prints the current line number, the
 * message `s`, and the scanner's current token text (yytext).
 *
 * Always returns 0. The explicit `int` return type replaces the implicit
 * int of the original definition, which C99 and later no longer allow.
 */
int yyerror(char *s)
{
    printf("\n \nLine: %d, Message: %s, Cause: %s\n", yylineno, s, yytext );
    return 0;
}

/*
 * End-of-input hook for the scanner. Returning 1 reports that there is no
 * further input to continue with.
 */
int yywrap(void)
{
    return 1;
}

Expected output is:

cout<<"hello"<<endl;
int a = 10;
cout<<a<<endl;

But the parsing fails, with partial output as:

cout<<"hello"<<a<<endl;

And error message:

Line: 2, Message: syntax error, Cause: =

The rules used to reduce are supposed to be (in same order) :

PRINT STRING
ID ASSIGN INTEGER
PRINT ID

but, the first rule being used to reduce is:

PRINT STRING ID

and the parsing fails

ID is in the next line, after PRINT STRING, but still the rule used is PRINT STRING ID.

I've given STRING a lower precedence than ID (I guess that is what the code below means):

%nonassoc STRING
%nonassoc ID

Is that the problem?

I'm unable to understand what is happening. Am I missing something?

Solution

One of your valid print actions is

PRINT STRING ID

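Your lexer discards newlines, so the parser sees one continuous token stream: PRINT STRING ID ASSIGN INTEGER PRINT ID. The first three tokens match this action, but the token that follows is an = sign, which the parser can't match as the start of any other action.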

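It seems like you want newlines to delimit your actions. Therefore, you need to explicitly create an end-of-action token, update your grammar so that your actions end with that token, and have the lexer generate the token when it sees a newline. For example, the lexer could return an EOL token for \n, and the action rule could become "action : print EOL | assign EOL".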