Search code examples
stringcompiler-constructionbison

Bison/Flex String Token Recognition


What do I have to write instead of

identifier      [a-zA-Z0-9]+

in order to also accept a string made up of only digits?

I wrote new bison and flex files to make my issue clear. Bison file:

%{
/* Prologue: copied verbatim into the generated parser. */
#include <stdio.h>
#include <stdlib.h>   /* free() for the strings handed over by the scanner */
#include <string>
using namespace std;
extern int yylex();
extern void yyerror(char*);
%}

/* Semantic value types that a token can carry. */
%union
{
    double double_val;  /* numeric value of a NUMBER token */
    char *str_val;      /* text of an IDENTIFIER token; strdup'd by the scanner */
};

/* Tag tokens: they carry no semantic value. */
%token START
%token STOP
%token BEGIN_NUM
%token END_NUM
%token BEGIN_STRING
%token END_STRING

/* Value-carrying tokens. */
%token <double_val> NUMBER
%token <str_val>    IDENTIFIER

%start MyTest

%%

/* A document is a START tag, any number of entries, then a STOP tag. */
MyTest:
    START Block STOP
    ;

/* Block is a left-recursive, possibly empty list of string and number entries. */
Block:
    /* empty */
    | Block BEGIN_STRING IDENTIFIER END_STRING {
          printf("received string: %s \n", $3);
          /* The scanner allocated this text with strdup(); the parser owns
             it from here on, so release it once it has been printed. */
          free($3);
      }
    | Block BEGIN_NUM NUMBER END_NUM { printf("received number: %f \n", $3); }
    ;

%%

Flex file:

%{
/* Prologue: copied verbatim into the generated scanner. */
#include <stdlib.h>   /* atof */
#include <string.h>   /* strdup */
#include <string>
#include "test.tab.h"
void yyerror(char*);
int yyparse(void);
%}
/* Named patterns used by the rules below. */
blanks          [ \t\n]+
identifier      [a-zA-Z0-9]+
/* The decimal point is escaped: a bare '.' would match any character. */
number          [0-9][0-9]*(\.[0-9]+)?
%%

{blanks}        { /* ignore */ }

"<test>"        return(START);
"</test>"   return(STOP);
"<string>"      return(BEGIN_STRING);
"</string>"     return(END_STRING);
"<num>"     return(BEGIN_NUM);
"</num>"        return(END_NUM);

{number}        { /* Convert the matched text to a double for the parser. */
                  yylval.double_val = atof(yytext);
                  return(NUMBER);
                }
{identifier}    {
                  /* yytext is reused by the scanner on the next match, so
                     hand the parser its own copy of the text. */
                  yylval.str_val=strdup(yytext);
                  return(IDENTIFIER);
                }

%%

/*
 * Error callback: prints a fixed " ERROR : Could not parse! " prefix followed
 * by the supplied message and a newline on standard output.
 *
 * str: NUL-terminated message text to append after the prefix.
 */
void yyerror (char* str)
{
    printf(" ERROR : Could not parse! %s\n", str);
}
/*
 * Called by the scanner when it reaches end of input.
 *
 * Returns 1 to tell the scanner there is no further input, so scanning stops.
 * The previous empty body fell off the end of a non-void function, leaving
 * the value the scanner reads indeterminate.
 */
int yywrap (void)
{
    return 1;
}
/*
 * Entry point: parses the file named by the single command-line argument.
 *
 * num_args: argument count; must be exactly 2 (program name + filename).
 * args:     argument vector; args[1] is the path of the file to parse.
 *
 * Returns 1 on a usage error or when the file cannot be opened; otherwise
 * returns the status reported by yyparse() (0 when parsing succeeded).
 */
int main(int num_args, char** args){
    if (num_args != 2) {
        printf("usage: ./parser filename\n");
        return 1;   /* wrong invocation is a failure, not a success */
    }

    FILE* file = fopen(args[1], "r");
    if (file == NULL) {
        printf("couldn't open %s\n", args[1]);
        return 1;
    }

    /* Point the scanner at the opened file before starting the parser. */
    yyin = file;
    int status = yyparse();
    fclose(file);

    /* Propagate the parser's result so callers can detect syntax errors. */
    return status;
}

Everything works when I give this file as input:

<test>
<num>1</num>
<string>eeeeee</string>
<num>2</num>
<string>cccc</string>
<num>3</num>
<num>4</num>
<string>asaa</string>
<string>dsa</string>
</test>

But if I change one of the string fields to a value consisting only of digits, like:

<string>323</string>

I get a syntax error...


Solution

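  • A string of only digits matches both {number} and {identifier} with the same length, and flex then picks the rule listed first, so the lexer returns the token NUMBER. You could therefore add a rule: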

    Block: Block BEGIN_STRING NUMBER END_STRING { printf("received number as string: %f \n", $3); }
    

    Alternatively, have your lexer just return TEXT tokens for everything not in <..>, and use that everywhere:

    [^<>]+  { 
                  /* Any run of characters other than '<' and '>' becomes one
                     TEXT token; the parser receives a strdup'd copy of it. */
                  yylval.str_val=strdup(yytext);
                  return(TEXT);
            }
    

    In addition, your lexer should have a rule like:

    .      fprintf(stderr, "Ignoring unknown character '%c'\n", *yytext);
    

    or

    .      return *yytext;
    

    at the end.

    Without such a rule, odd characters in your input will just be echoed to the output, which is almost certainly not what you want for a compiler (though it may be fine for a simple text processor that is just making some changes to the input, and leaving the rest alone as a pass-through). Which you want depends on how you want to handle errors -- if you have no error recovery rules in your grammar and just want to ignore the extra characters, the former is fine, while if you've implemented an error recovery scheme in your grammar, the latter gives any extra characters as single tokens to the parser, where your error recovery can do something smarter with them.