Search code examples
parsingflex-lexeryacclexer

flex - Simple parser gives error: fatal flex scanner internal error--end of buffer missed


I'm trying to implement a simple parser that calculates addition, subtraction, multiplication and division using fractional numbers. Fractional numbers in this form: nominatorfdenominator like this 2f3 4f6 9f4 etc. Parser should run on REPL mode. To compile and run:

lex lexer.l
yacc -d parser.y
cc lex.yy.c y.tab.c -lm -o main
./main

flex code:

%{
  #include "y.tab.h"
  extern YYSTYPE yylval;
  #include <math.h>
  void to_int(char* num, int* arr);
%}

IDENTIFIER_ERROR [0-9][a-zA-Z0-9_]*
COMMENT ";;".*   
VALUESTR \"(.*?)\"

%%

[ \t\f\v\n] { ; }

exit   { return KW_EXIT; }

"+"  { return OP_PLUS; }
"-"  { return OP_MINUS; }
"/"  { return OP_DIV; }
"*"  { return OP_MULT; }
"("  { return OP_OP; }
")"  { return OP_CP; }

(0)|([1-9]+"f"[1-9]*)    { to_int(yytext, yylval.INT_ARR); return VALUEF; }
[a-zA-Z_]([a-zA-Z0-9_]*) { strcpy(yylval.STR, yytext); return IDENTIFIER; } 
{COMMENT}    { printf("%s: COMMENT\n",    yytext); }
{IDENTIFIER_ERROR} { printf("%s: SYNTAX ERROR\n", yytext); exit(1); }
                     
. { printf("%s: SYNTAX ERROR\n", yytext); exit(1); }



%%

// fractional number taken as a string, converting it to: arr[0] = nominator, arr[0]  = nominator, arr[1] = denominator,
void to_int(char* num, int* arr) {
  char* nominator, *denominator;

  strcpy(nominator, num); // nominator contains whole number for now
  strtok_r(nominator, "f", &denominator);

  //printf ("lex: NUMS parsed as: %s  %s\n", nominator, denominator);

  arr[0] = atoi(nominator);
  arr[1] = atoi(denominator);

  //printf("lex: nom: %d denom: %d\n", arr[0], arr[1]);

}

int yywrap(){
  return 1;
}

yacc file:

%{
    #include <stdio.h>
    #include <stdlib.h>
  #include <string.h>
  #include <math.h>

  int yylex(void);
  void yyerror(char *str);

  void fractional_divide(int* num1, int* num2, int* RESULTF);
  void fractional_multiply(int* num1, int* num2, int* RESULTF);
  void fractional_sub(int* num1, int* num2, int* RESULTF);
  void fractional_sum(int* num1, int* num2, int* RESULTF);
%}


%token KW_EXIT

%token OP_PLUS OP_MINUS OP_DIV OP_MULT OP_OP OP_CP OP_COMMA

%union{
  int INTEGER;
  int INT_ARR[2];
  char STR[20];
};

%start START
%type<INT_ARR> EXP
%token <INT_ARR> VALUEF


%%

START : EXPLIST ;  

EXPLIST : EXP | EXPLIST EXP ;

EXP:    OP_OP OP_PLUS  EXP EXP OP_CP { fractional_sum($3, $4, $$); printf("> %d%c%d\n", $$[0], 'f', $$[1]); }
      | OP_OP OP_MINUS EXP EXP OP_CP { fractional_sub($3, $4, $$); printf("> %d%c%d\n", $$[0], 'f', $$[1]); }
      | OP_OP OP_DIV   EXP EXP OP_CP { fractional_divide($3, $4, $$); printf("> %d%c%d\n", $$[0], 'f', $$[1]); }
      | OP_OP OP_MULT  EXP EXP OP_CP { fractional_multiply($3, $4, $$); printf("> %d%c%d\n", $$[0], 'f', $$[1]); }

      | VALUEF { $$[0] = $1[0]; $$[1] = $1[1];} 

      | KW_EXIT { printf("exiting...\n"); return 0; } 
      ;



%%

void equalize_denominators(int* num1, int* num2) {
  num1[0] *= num2[1]; 
  num1[1] *= num2[1];

  num2[0] *= num1[1]; 
  num2[1] *= num1[1]; 
}

void fractional_sum(int* num1, int* num2, int* RESULTF) {
  if (num1[1] != num2[1])
    equalize_denominators(num1, num2);

  RESULTF[0] = num1[0] + num2[0];
  RESULTF[1] = num2[1];
}

void fractional_sub(int* num1, int* num2, int* RESULTF) {
  if (num1[1] != num2[1])
    equalize_denominators(num1, num2);

  RESULTF[0] = num1[0] - num2[0];
  RESULTF[1] = num2[1];
}

void fractional_divide(int* num1, int* num2, int* RESULTF) {
  RESULTF[0] = num1[0] * num2[1];
  RESULTF[1] = num1[1] * num2[0];
}

void fractional_multiply(int* num1, int* num2, int* RESULTF) {
  RESULTF[0] = num1[0] * num2[0];
  RESULTF[1] = num1[1] * num2[1];
}

void yyerror(char *str) {
    printf("yyerror: %s\n", str);
}

int main(int argc, char *argv[]){


  if(argc == 1)
    yyparse();

  else {
    printf("Input error. Exiting...\n");
    exit(1);
  }

  return 0;
}

sample output, first line is ok, but when I hit the enter after second line I get this error:

(+ 2f3 1f3)
result: 3f3
(* 2f1 2f6)
result: 4f6
fatal flex scanner internal error--end of buffer missed

Solution

  • That error message can occur in some specific circumstances involving the use of yymore() in the last token in the input, but probably the most common cause is memory corruption, which is what you've managed to do here.

    It's likely that the issue is in to_int, where you do a strcpy whose destination is an uninitialised pointer:

    void to_int(char* num, int* arr) {
      char* nominator, *denominator;
    
      strcpy(nominator, num);  // FIXME nominator is uninitialised
    

    It's actually not clear to me why you feel the need to make a copy of the argument num, since you are calling it with yytext. You're free to modify the contents of yytext as long as you don't write outside of its memory area. (The variable yyleng tells you how long yytext is.) Since strtok does not modify it's argument outside of the contents area, it's safe to apply to yytext. But if you are going to copy num, you obviously have to copy it to an actual validly initialized pointer; otherwise chaos will ensue. (Which could include triggering random flex error messages.)

    I didn't check your code thoroughly nor did I attempt to run it, so there may be other errors. In particular, I did notice a couple of problems with your token patterns:

    1. (0)|([1-9]+"f"[1-9]*) does not allow 10f2 or 2f103, since you only allow integers written with digits 1 through 9. It also allows 2f, whose meaning is opaque to me, and your to_int function could blow up on it. (At best, it would end up with a denominator of 0, which is also an error.) I'd recommend using two patterns, one for integers and the other for fractions:

      0|[1-9][0-9]* { yylval.INT_ARG[0] = atoi(yytext);
                      yylval.INT_ARG[1] = 1;
                      return VALUEF;
                    }
      0|[1-9][0-9]*f[1-9][0-9]* {
                      to_int(yytext, yylval.INT_ARR);
                      return VALUEF;
                    }
      

      But you might want to add more meaningful error messages for illegal numbers like 03f2 and 3f0.

    2. Although you don't use it anywhere, your pattern for character strings is incorrect, since (f)lex does not implement non-greedy matching. A better pattern would be \"[^"]*\" or \"[^"\n]*\" (which prohibits newlines inside strings); even better would be to allow backslash escapes with something like \"(\\.|[^"\\\n])*\". There are lots of other variants but that basically covers the general principle. (Some of us prefer ["] to \" but that's just stylistic; the meaning is the same.)

    Also, it is bad style to call exit from a yylex action. It's better to arrange for some kind of error return. Similarly, you should not use a return statement from a yyparse action, since it leaves the parser's internal state inconsistent, and does not allow the parser to free the resources it has allocated. Use YY_ACCEPT (or YY_ABORT if you want to signal a syntax error). These are described in the documentation or any good guide.