Search code examples
c parsing bison operator-precedence

Bison: Unexpected token error when trying to parse a multiline expression


I'm making a Visual Basic parser with Flex and Bison for a uni assignment. Most of it seems to be working properly except for the parsing of multiline expressions. Here is an example of a bit of code that doesn't work:

A = A +
1

And what is interesting is that removing A = makes it parse properly. Same applies if I replace = with an operator with higher or equal precedence than that of +.

The parser seems to prioritize single line expressions over multiline ones. I understand that this is a precedence issue, but I have no idea how to solve this.

Truncated Flex code:

%option nounistd
%option noyywrap
%option case-insensitive

%{
/*
 * Scanner for the Visual Basic expression parser.
 *
 * Single-character operators are returned as their own character code.
 * INT_VALUE, IDENTIFIER and END_OF_LINE (and yylval) come from
 * parcer-mini.tab.h. Scanner-level failures (integer literal out of range,
 * allocation failure) print a message to stderr and exit with status 1.
 */
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "parcer-mini.tab.h"

#pragma warning(disable : 4996)
#define YY_DECL int yylex()

int result = 0;

%}
%x STRING_LITERAL

%%
%{
    /* Code placed before the first rule runs on every entry to yylex(),
     * so this buffer is re-created and cleared for each token request. */
    char buf[100000];
    memset(buf, '\0', sizeof buf);
%}

"+"                 { return '+'; }
"-"                 { return '-'; }
"*"                 { return '*'; }
"/"                 { return '/'; }
"="                 { return '='; }
"<"                 { return '<'; }
">"                 { return '>'; }
"^"                 { return '^'; }

[0-9]+ {
    /* strtol reports overflow through errno, whereas atoi has undefined
     * behaviour when the literal does not fit in an int. */
    long value;

    errno = 0;
    value = strtol(yytext, NULL, 10);
    if (errno == ERANGE || value > INT_MAX) {
        fprintf(stderr, "Integer literal out of range: %s\n", yytext);
        exit(1);
    }
    yylval.int_val = (int) value;
    return INT_VALUE;
}

[a-zA-Z_][a-zA-Z_0-9]* {
    /* yytext is reused by the next match, so the parser is handed its own
     * heap copy of the name. Nothing in the code shown here frees it. */
    size_t size = strlen(yytext) + 1;

    yylval.id_var_name = (char *) malloc(size);
    if (yylval.id_var_name == NULL) {
        fprintf(stderr, "Out of memory while scanning identifier\n");
        exit(1);
    }
    memcpy(yylval.id_var_name, yytext, size);
    return IDENTIFIER;
}

\n+  { /* a run of consecutive newlines is reported as one END_OF_LINE */ return END_OF_LINE; }

<<EOF>> {
    /* The first end-of-file yields one synthetic END_OF_LINE so the last
     * statement is terminated even without a trailing newline; every later
     * call returns 0 to signal end of input. */
    static int once = 0;
    return once++ ? 0 : END_OF_LINE;
}
%%

Truncated Bison code:

/*
 * Bison grammar for a small Visual Basic expression subset.
 * Every action only prints a trace line naming the rule that was reduced;
 * no semantic values are built in the code shown here.
 */
%define parse.error verbose
%{
#pragma warning(disable : 4996)
#include <stdio.h>
#include <stdlib.h>

extern int yylineno;
extern FILE* yyin;

extern int yyparse();
extern int yylex();

void yyerror(const char* s);
%}

/* Semantic value carried by INT_VALUE (int_val) and IDENTIFIER (id_var_name). */
%union {
    int int_val;
    char* id_var_name;
}

/* NOTE(review): the tags <expression> and <statement> are not members of the
 * %union shown above - presumably lost when the code was truncated; confirm. */
%type <expression> expr_singleline expr_multiline basic_literal_value;
%type <statement> stmt root;
%type stmt_ends;

%token<int_val> INT_VALUE

%token<id_var_name> IDENTIFIER

%token END_OF_LINE

/* Operator precedence, lowest first: each later line binds tighter than the
 * lines before it. */
%left '='
%left '>' '<'

%left '+' '-'
%left '*' '/'
%right UNARY_MINUS UNARY_PLUS   /* pseudo-tokens, used only through %prec */
%left '^'

/* NOTE(review): no rule shown here needs this precedence - presumably it
 * resolves a conflict in rules that were truncated; confirm. */
%precedence IDENTIFIER

%start root

%%

/* The whole input is a single statement. */
root: stmt {printf("root 1\n");}
    ;
    
/* A statement is one expression, written on one line or split after a binary
 * operator, followed by at least one END_OF_LINE. */
stmt: expr_multiline stmt_ends {printf("stmt 1\n");}
    | expr_singleline stmt_ends {printf("stmt 2\n");}
    ;          
    
/* One or more END_OF_LINE tokens. */
stmt_ends: END_OF_LINE {printf("stmt_ends 1\n");}
    | stmt_ends END_OF_LINE {printf("stmt_ends 2\n");}
    ;
    
/* Expression that contains no line break. */
expr_singleline: basic_literal_value {printf("expr_single 0\n");}
    | '-' expr_singleline   %prec UNARY_MINUS {printf("expr_single 1\n");}
    | '+' expr_singleline   %prec UNARY_PLUS {printf("expr_single 2\n");}
    | expr_singleline '+' expr_singleline {printf("expr_single 3\n");}
    | expr_singleline '-' expr_singleline {printf("expr_single 4\n");}
    | expr_singleline '*' expr_singleline {printf("expr_single 5\n");}
    | expr_singleline '/' expr_singleline {printf("expr_single 6\n");}
    | expr_singleline '=' expr_singleline {printf("expr_single 8\n");}
    | expr_singleline '<' expr_singleline {printf("expr_single 9\n");}
    | expr_singleline '>' expr_singleline {printf("expr_single 10\n");}
    | expr_singleline '^' expr_singleline {printf("expr_single 11\n");}
    | IDENTIFIER {printf("expr_single 17\n");}
    ;

/* Binary expression whose operator is followed by a line break.
 * This nonterminal is reachable only directly from stmt, and both of its
 * operands are expr_singleline, so a line break is accepted only after the
 * operator of a statement's outermost binary expression. For the input
 * "A = A +" <newline> "1", the '+' is shifted while "A = A" is still
 * unreduced (because '+' binds tighter than '='), which places the '+' inside
 * the right operand of '=' where only the expr_singleline rules apply; the
 * following END_OF_LINE is therefore a syntax error. */
expr_multiline: expr_singleline '+' END_OF_LINE expr_singleline {printf("expr_multi 1\n");}
              | expr_singleline '-' END_OF_LINE expr_singleline {printf("expr_multi 2\n");}
              | expr_singleline '*' END_OF_LINE expr_singleline {printf("expr_multi 3\n");}
              | expr_singleline '/' END_OF_LINE expr_singleline {printf("expr_multi 4\n");}
              | expr_singleline '=' END_OF_LINE expr_singleline {printf("expr_multi 6\n");}
              | expr_singleline '<' END_OF_LINE expr_singleline {printf("expr_multi 7\n");}
              | expr_singleline '>' END_OF_LINE expr_singleline {printf("expr_multi 8\n");}
              | expr_singleline '^' END_OF_LINE expr_singleline {printf("expr_multi 9\n");}
              ;

/* Literal operand; only integer literals appear in the rules shown. */
basic_literal_value: INT_VALUE {printf("basic_literal_value int\n");}
                   ;                   
%%
/*
 * Program entry point: parses the file named by the first command-line
 * argument.
 *
 * Returns the yyparse() status (0 on success) or EXIT_FAILURE when the file
 * cannot be opened. With no argument the error goes through yyerror(),
 * which terminates the process.
 */
int main(int argc, char** argv) {
    int status;

    if (argc <= 1) {
        yyerror("not found file");
        return EXIT_FAILURE; /* not reached: yyerror() exits */
    }

    yyin = fopen(argv[1], "r");
    if (yyin == NULL) {
        /* A Flex scanner handed a NULL yyin reads stdin instead, so a bad
         * path must be reported here rather than passed on to yyparse(). */
        perror(argv[1]);
        return EXIT_FAILURE;
    }

    status = yyparse();
    fclose(yyin);
    return status;
}

/*
 * Error callback used by the parser and by main(): writes the message to
 * stderr behind a "Parse error: " prefix, then terminates the process with
 * exit status 1. It never returns to its caller.
 */
void yyerror(const char* s) {
    enum { PARSE_ERROR_EXIT_STATUS = 1 };

    fprintf(stderr, "Parse error: %s\n", s);
    exit(PARSE_ERROR_EXIT_STATUS);
}

Solution

  • I couldn't figure out how to solve the aforementioned issue, but I found a workaround. By introducing an optional end-of-line nonterminal (optEoL) after every binary operator, I merged expr_singleline and expr_multiline into a single expr rule. This method works perfectly.

    Modified truncated Bison code:

    /*
     * Bison grammar for a small Visual Basic expression subset in which a
     * line break may follow any binary operator.
     * Every action only prints a trace line naming the rule that was reduced;
     * no semantic values are built in the code shown here.
     */
    %define parse.error verbose
    %{
    #pragma warning(disable : 4996)
    #include <stdio.h>
    #include <stdlib.h>
    
    extern int yylineno;
    extern FILE* yyin;
    
    extern int yyparse();
    extern int yylex();
    
    void yyerror(const char* s);
    %}
    
    /* Semantic value carried by INT_VALUE (int_val) and IDENTIFIER (id_var_name). */
    %union {
        int int_val;
        char* id_var_name;
    }
    
    /* NOTE(review): the tags <expression> and <statement> are not members of
     * the %union shown above - presumably lost in truncation; confirm. */
    %type <expression> expr basic_literal_value;
    %type <statement> stmt root;
    %type stmt_ends optEoL;
    
    %token<int_val> INT_VALUE
    
    %token<id_var_name> IDENTIFIER
    
    %token END_OF_LINE
    
    /* Operator precedence, lowest first: each later line binds tighter than
     * the lines before it. */
    %left '='
    %left '>' '<'
    
    %left '+' '-'
    %left '*' '/'
    %right UNARY_MINUS UNARY_PLUS   /* pseudo-tokens, used only through %prec */
    %left '^'
    
    /* NOTE(review): no rule shown here needs this precedence - presumably it
     * resolves a conflict in rules that were truncated; confirm. */
    %precedence IDENTIFIER
    
    %start root
    
    %%
    
    /* The whole input is a single statement. */
    root: stmt {printf("root 1\n");}
        ;
        
    /* A statement is one expression followed by at least one END_OF_LINE. */
    stmt: expr stmt_ends {printf("stmt 2\n");}
        ;          
        
    /* One or more END_OF_LINE tokens. */
    stmt_ends: END_OF_LINE {printf("stmt_ends 1\n");}
        | stmt_ends END_OF_LINE {printf("stmt_ends 2\n");}
        ;
        
    /* Optional line break. It is placed after every binary operator so that
     * an expression can continue on the next line at any nesting depth. */
    optEoL: /*empty*/
          | END_OF_LINE {printf("optEoL 1\n");}
          ;
        
    /* Expression. Both operands of a binary operator are expr: the separate
     * expr_singleline nonterminal no longer exists in this grammar, so naming
     * it as an operand would leave an undefined symbol. Because optEoL is a
     * nonterminal, each binary rule still takes its precedence from its
     * operator token, the last terminal in the rule. */
    expr: basic_literal_value {printf("expr_single 0\n");}
        | '-' expr %prec UNARY_MINUS {printf("expr_single 1\n");}
        | '+' expr %prec UNARY_PLUS {printf("expr_single 2\n");}
        | expr '+' optEoL expr {printf("expr_single 3\n");}
        | expr '-' optEoL expr {printf("expr_single 4\n");}
        | expr '*' optEoL expr {printf("expr_single 5\n");}
        | expr '/' optEoL expr {printf("expr_single 6\n");}
        | expr '=' optEoL expr {printf("expr_single 8\n");}
        | expr '<' optEoL expr {printf("expr_single 9\n");}
        | expr '>' optEoL expr {printf("expr_single 10\n");}
        | expr '^' optEoL expr {printf("expr_single 11\n");}
        | IDENTIFIER {printf("expr_single 17\n");}
        ;
    
    /* Literal operand; only integer literals appear in the rules shown. */
    basic_literal_value: INT_VALUE {printf("basic_literal_value int\n");}
                       ;                   
    %%
    /*
     * Program entry point: parses the file named by the first command-line
     * argument.
     *
     * Returns the yyparse() status (0 on success) or EXIT_FAILURE when the
     * file cannot be opened. With no argument the error goes through
     * yyerror(), which terminates the process.
     */
    int main(int argc, char** argv) {
        int status;

        if (argc <= 1) {
            yyerror("not found file");
            return EXIT_FAILURE; /* not reached: yyerror() exits */
        }

        yyin = fopen(argv[1], "r");
        if (yyin == NULL) {
            /* A Flex scanner handed a NULL yyin reads stdin instead, so a bad
             * path must be reported here rather than passed on to yyparse(). */
            perror(argv[1]);
            return EXIT_FAILURE;
        }

        status = yyparse();
        fclose(yyin);
        return status;
    }
    
    /*
     * Error callback used by the parser and by main(): writes the message to
     * stderr behind a "Parse error: " prefix, then terminates the process
     * with exit status 1. It never returns to its caller.
     */
    void yyerror(const char* s) {
        enum { PARSE_ERROR_EXIT_STATUS = 1 };

        fprintf(stderr, "Parse error: %s\n", s);
        exit(PARSE_ERROR_EXIT_STATUS);
    }