Search code examples
c bison flex-lexer

C compiler (GCC) gives several errors when using Flex and Bison


I'm trying to write a compiler for a small language I designed. I'm using Flex as the lexical analyzer generator and Bison as the parser generator. I've read the Wikipedia page on Bison and several Stack Overflow posts on getting Flex and Bison to work together nicely, but for some reason I'm still getting errors. Here's the Makefile:

# Builds the `language` executable from main.c plus the sources generated by
# flex (Lexer.c/Lexer.h) and bison (Parser.c/Parser.h).
# Every recipe line below must start with a TAB character, not spaces.
CC      = gcc
CFLAGS  = -O2 -Wall -Wextra -Wpedantic
# Libraries are kept out of CFLAGS so they can be placed after the sources on
# the link line; the linker resolves symbols in command-line order.
LDLIBS  = -lfl
INFILES = main.c Parser.c Lexer.c
OUTFILE = language

# These targets do not name files, so they must always be considered out of date.
.PHONY: default clean

default: $(OUTFILE)

# Re-link only when a source or the shared header has changed.
$(OUTFILE): $(INFILES) eval.h
	$(CC) $(CFLAGS) $(INFILES) -o $(OUTFILE) $(LDLIBS)

# funlang.l names its own outputs via %option outfile/header-file.
Lexer.c: funlang.l
	flex funlang.l

# funlang.y names its own outputs via %output/%defines.
Parser.c: funlang.y Lexer.c
	bison -d -Wcounterexamples funlang.y

# -f keeps `make clean` from failing when nothing has been built yet.
clean:
	rm -f Lexer.* Parser.* $(OUTFILE)

main.c:

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include "eval.h"



/*
 * Reads the file named by argv[1] completely into a heap buffer.
 *
 * argc/argv: one input path is required; argc values above 3 are rejected.
 * Returns EXIT_SUCCESS once the whole file has been read, and EXIT_FAILURE on
 * a usage, open, allocation, or read error (each reported on stderr).
 * The file handle and the buffer are released on every path.
 */
int main(int argc, char *argv[]){
 const char *program = (argc > 0 && argv[0] != NULL) ? argv[0] : "language";
 int status = EXIT_FAILURE;
 char *input = NULL;
 long end_offset;
 size_t input_length;
 size_t bytes_read;
 FILE *file;

 if(argc < 2){
  fprintf(stderr, "%s error: No input file supplied\n", program);
  return EXIT_FAILURE;
 }
 if(argc > 3){
  fprintf(stderr, "%s error: Too many arguments supplied\n", program);
  return EXIT_FAILURE;
 }

 file = fopen(argv[1], "r");
 if(file == NULL){
  fprintf(stderr, "Error opening file %s\n", argv[1]);
  return EXIT_FAILURE;
 }

 /* Measure the file by seeking to its end; ftell reports failure as -1L,
    which must not be converted to size_t and used as a length. */
 if(fseek(file, 0, SEEK_END) != 0){
  fputs("Error reading file\n", stderr);
  goto cleanup;
 }
 end_offset = ftell(file);
 if(end_offset < 0){
  fputs("Error reading file\n", stderr);
  goto cleanup;
 }
 rewind(file);
 input_length = (size_t) end_offset;

 /* One extra byte keeps the buffer NUL-terminated and avoids malloc(0). */
 input = malloc(input_length + 1);
 if(input == NULL){
  fputs("Error allocating memory for input\n", stderr);
  goto cleanup;
 }

 bytes_read = fread(input, 1, input_length, file);
 if(bytes_read != input_length){
  fputs("Error reading file\n", stderr);
  goto cleanup;
 }
 input[bytes_read] = '\0';

 /* The parser and lexer are not invoked yet, because they do not build so far. */

 status = EXIT_SUCCESS;

cleanup:
 free(input);
 fclose(file);
 return status;
}

eval.h:

#include "Parser.h"
#include "Lexer.h"

funlang.l:

%option outfile="Lexer.c" header-file="Lexer.h"

%option bison-bridge bison-locations never-interactive reentrant
%option warn nodefault nounistd yylineno noinput nounput
%option noyywrap batch

%{
#include <stdint.h>
#include <string.h>
#include "Parser.h"
#include "Lexer.h"
size_t line_count = 1; /* current input line, starting at 1; incremented by the \n rule */
%}


int_literal        ((-)?((0x)[0-9A-Fa-f]+|[0-9]+))
str_literal        ("[A-Za-z0-9 \t!#-&(-/:-@[-_{-~]*")
whitespace         [ \t\r]
identifier         ([A-Za-z])([A-Za-z0-9]+)

%%
"func"                {return FUNCTION_KEYWORD; }
"if"                  {return IF;               }
"else"                {return ELSE;             }
"int"                 {return INT_KEYWORD       /* NOTE(review): no ';' terminates this return */}
"str"                 {return STR_KEYWORD;      }
"bool"                {return BOOL_KEYWORD;     }
"true"                {return BOOL_LITERAL;     }
"false"               {return BOOL_LITERAL;     }

{int_literal}         {yylval.val = (intmax_t) strtol(yytext, (char **) NULL, 0); /* base 0: strtol infers hex from the 0x prefix (a leading 0 is read as octal) */
                       return INT_LITERAL;      /* NOTE(review): bison-bridge is enabled above; confirm whether yylval must be accessed with -> instead of . */ }

{str_literal}         {return STR_LITERAL;      }
"->"                  {return GIVES_TYPE;       }
{identifier}          {yylval.name = strdup(yytext); /* strdup returns a heap-allocated copy of the lexeme */
                       return IDENTIFIER;       }
\n                    {line_count++; /* no token is returned for a newline */ }
{whitespace}
.                     {printf("Error, unrecognized char at line #%li", line_count); /* NOTE(review): %li is paired with a size_t argument */
                       return OTHER;            }

%%

and funlang.y:

/* funlang.y: Bison grammar for the language.
   Requests a pure parser with location tracking; the scanner handle is passed
   as an extra parameter to both the parser and yylex (%param). */
%define api.pure full
%locations
%param {yyscan_t scanner}

%code top{
 #include <stdio.h>
 #include <stdint.h>
}

/* Opaque scanner handle type, declared here as void*. */
%code requires{
 typedef void* yyscan_t;
}

/* Prototype of the scanner entry point: semantic value and location are
   passed by pointer, followed by the scanner handle. */
%code{
 int yylex(YYSTYPE* yylvalp, YYLTYPE* yyllocp, yyscan_t scanner);
}

/* NOTE(review): this includes the generated header from inside the generated
   parser itself; confirm it is needed, since the parser already carries
   those definitions. */
%{
#include "Parser.h"
%}

/* Tokens without a semantic value. GIVES_TYPE is the "->" lexeme. */
%token FUNCTION_KEYWORD IF ELSE INT_KEYWORD STR_KEYWORD BOOL_KEYWORD
%token BOOL_LITERAL STR_LITERAL GIVES_TYPE OTHER
/* Tokens whose value lives in the %union member named in <...>. */
%token <val>  INT_LITERAL
%token <name> IDENTIFIER

%output  "Parser.c"
%defines "Parser.h"

/* Semantic value: name is a fixed 16-byte character array, val an intmax_t.
   NOTE(review): the scanner assigns the result of strdup() to name; an array
   member cannot be assigned to, so confirm the intended type (char*?). */
%union{
 char name[16];
 intmax_t val;
};


%%

/* Start symbol. A function definition: keyword, name, parenthesised parameter
   list, "->", a return type keyword, and a braced body holding exactly one
   statement. The three alternatives differ only in the return type keyword. */
function: FUNCTION_KEYWORD IDENTIFIER '(' parameters ')' GIVES_TYPE INT_KEYWORD '{' statement '}'
|         FUNCTION_KEYWORD IDENTIFIER '(' parameters ')' GIVES_TYPE STR_KEYWORD '{' statement '}'
|         FUNCTION_KEYWORD IDENTIFIER '(' parameters ')' GIVES_TYPE BOOL_KEYWORD '{' statement '}';

/* A possibly empty parameter list. */
parameters: parameter
|           %empty;

/* One or more comma-separated "type name" pairs (left-recursive). */
parameter: INT_KEYWORD                IDENTIFIER
|          parameter ',' INT_KEYWORD  IDENTIFIER
|          STR_KEYWORD                IDENTIFIER
|          parameter ',' STR_KEYWORD  IDENTIFIER
|          BOOL_KEYWORD               IDENTIFIER
|          parameter ',' BOOL_KEYWORD IDENTIFIER;

/* A possibly empty argument list for a call.
   NOTE(review): this derives `parameter` (typed declarations), not
   `call_parameter`; confirm which was intended. */
call_parameters: parameter
|                %empty;


/* NOTE(review): no other rule in this grammar refers to call_parameter, and
   its recursive alternative begins with `parameter`; confirm intent. */
call_parameter: IDENTIFIER
|               parameter ',' IDENTIFIER;

/* One or more comma-separated elements; each element is a literal, an
   identifier, a function call, or an indexing expression. */
array_list: INT_LITERAL
|           IDENTIFIER
|           function_call
|           indexing_expression
|           STR_LITERAL
|           BOOL_LITERAL
|           array_list ',' IDENTIFIER
|           array_list ',' function_call
|           array_list ',' indexing_expression
|           array_list ',' INT_LITERAL
|           array_list ',' STR_LITERAL
|           array_list ',' BOOL_LITERAL;


/* A declaration introduced by a type keyword, with an optional '=' initialiser,
   always terminated by ';'. Only the int form accepts a math_expression. The
   forms using indexing_expression take an array_list initialiser (presumably
   array declarations; confirm). */
var_definition: INT_KEYWORD IDENTIFIER '=' INT_LITERAL ';'
|               INT_KEYWORD IDENTIFIER '=' math_expression ';'
|               INT_KEYWORD IDENTIFIER '=' function_call ';'
|               INT_KEYWORD IDENTIFIER ';'
|               INT_KEYWORD indexing_expression '=' array_list ';'
|               INT_KEYWORD indexing_expression ';'
|               STR_KEYWORD IDENTIFIER '=' STR_LITERAL ';'
|               STR_KEYWORD IDENTIFIER '=' function_call ';'
|               STR_KEYWORD IDENTIFIER ';'
|               STR_KEYWORD indexing_expression '=' array_list ';'
|               STR_KEYWORD indexing_expression ';'
|               BOOL_KEYWORD IDENTIFIER '=' BOOL_LITERAL ';'
|               BOOL_KEYWORD IDENTIFIER '=' function_call ';'
|               BOOL_KEYWORD IDENTIFIER ';'
|               BOOL_KEYWORD indexing_expression '=' array_list ';'
|               BOOL_KEYWORD indexing_expression ';';


/* A call: name followed by a parenthesised argument list. */
function_call: IDENTIFIER '(' call_parameters ')';

/* A boolean expression; every form is wrapped in parentheses.
   NOTE(review): "==", "&&" and "||" are string-literal tokens here, and the
   scanner listing has no rule that returns them; confirm how they are lexed. */
boolean_statement: '(' IDENTIFIER "==" IDENTIFIER ')'
|                  '(' IDENTIFIER ')'               
|                  '(' indexing_expression ')'     
|                  '(' boolean_statement ')'
|                  '(' boolean_statement "&&" boolean_statement ')'
|                  '(' boolean_statement "||" boolean_statement ')'
|                  '(' '!' boolean_statement ')'
|                  '(' function_call ')';            

/* The operator of a math_expression. Despite the upper-case name this is a
   nonterminal, not a token. */
BINARY_OPERATOR: '+'| '-' | '*' | '/' | '&' | '|';

/* Prefix-notation arithmetic: '(' operator operand operand ')'. Each
   alternative spells out one pairing of operand kinds.
   NOTE(review): there is no alternative for IDENTIFIER IDENTIFIER,
   IDENTIFIER function_call, or function_call IDENTIFIER; confirm whether
   that is deliberate. */
math_expression: '(' BINARY_OPERATOR INT_LITERAL       INT_LITERAL         ')'
|                '(' BINARY_OPERATOR INT_LITERAL       IDENTIFIER          ')'
|                '(' BINARY_OPERATOR IDENTIFIER        INT_LITERAL         ')'
|                '(' BINARY_OPERATOR INT_LITERAL       function_call       ')'
|                '(' BINARY_OPERATOR function_call     INT_LITERAL         ')'
|                '(' BINARY_OPERATOR function_call     function_call       ')'
|                '(' BINARY_OPERATOR INT_LITERAL       math_expression     ')'
|                '(' BINARY_OPERATOR math_expression   INT_LITERAL         ')'
|                '(' BINARY_OPERATOR math_expression   IDENTIFIER          ')'
|                '(' BINARY_OPERATOR IDENTIFIER        math_expression     ')'
|                '(' BINARY_OPERATOR math_expression   function_call       ')'
|                '(' BINARY_OPERATOR function_call     math_expression     ')'
|                '(' BINARY_OPERATOR math_expression   math_expression     ')';

/* name '[' index ']' where the index is an integer literal or an identifier. */
indexing_expression: IDENTIFIER '[' INT_LITERAL ']'
|                    IDENTIFIER '[' IDENTIFIER  ']'; 

/* A single statement: an if or else with a braced single-statement body, a
   return of a call, identifier, or literal, or a variable definition. There is
   no rule for a sequence of statements.
   NOTE(review): "return" is a string-literal token and the scanner listing
   has no rule that returns it; confirm how it is lexed. */
statement: IF boolean_statement '{' statement '}'
|          ELSE '{' statement '}'
|          "return" function_call  ';'
|          "return" IDENTIFIER     ';'
|          "return" INT_LITERAL    ';'
|          "return" STR_LITERAL    ';'
|          "return" BOOL_LITERAL   ';'
|          var_definition;

%%

The big issue I'm having is that when compiling everything, there are several stages in which the C compiler says that 'yyin', 'yyout', 'yyleng' and several others are undeclared. E.g., Lexer.c:800:10: error: ‘yyin’ undeclared (first use in this function)

I'm sorry if I left out any important information, or if this is otherwise unclear. I will do my best to clarify any ambiguities. Thank you in advance


Solution

  • The undefined symbol errors are all the result of you putting

    #include "Lexer.h"
    

    in your generated lexer. You must never do that; the lexer header information is already included in the generated lexer and including it twice leads to errors.

    Similarly, you should not put

    #include "Parser.h"
    

    into the generated parser. Again, the generated parser already has the necessary definitions, and the double include can lead to problems.

    I tried to show the outline of a reentrant bison/flex project in this answer. That answer carefully uses only the necessary includes, and tries to explain some of the rationale.

    Also note that yyerror is necessary for the compilation to succeed. The version in the above answer should be a good start.


    Additional notes

    There are some additional errors in your code.

    1. At line 25 of funlang.l, there is a missing ;

    2. In the actions at lines 31 and 36, you need to take into account this paragraph (from the linked answer on reentrant parsers):

      All references to yylval in scanner actions also need to be modified, since bison's reentrant API passes pointers to the semantic value and location objects. If the semantic type is a union (normally produced by placing a %union declaration in the bison source), then you'll need to change scanner actions which use yylval.tag to yylval->tag. Similarly, if you use a single semantic type, either the default type or one declared (in the bison source) with %define api.value.type, then you'll need to replace yylval = ... with *yylval = ..., as in the sample code above.

      Even with that fix, yylval->name = strdup(yytext) will not work because your parser's %union declaration defines name as a fixed-length character array, and you can't assign to an array. I'd strongly recommend changing name to type char*; that will be compatible with the use in your lexical scanner, and will not cause buffer overruns if someone happens to supply a long identifier.

    3. Your regular expression for character string literals is wrong, because the whole pattern is enclosed in double quotes and (f)lex therefore treats it as a quoted string, matching its contents literally. In (f)lex, patterns can contain quoted strings, which proves to be quite useful but sadly is not implemented in any regex library I know of. You need to escape each " as \" to get what you're looking for. The pattern is also quite hard to read; I'd suggest using a negated character class to show which characters you're excluding. But that's just a suggestion.

    4. You have %option yylineno which will get flex to track line numbers for you. There's a definition of YY_USER_ACTION in my lexer.l which uses that to populate the location objects, so that the token locations are available to bison. If you use that code, you won't need to track line numbers yourself and so you won't need the line_count variable or the rule for \n, which could just be added to your whitespace rule.

    5. With respect to the whitespace rule, flex -- unlike lex -- usually allows rules without actions, but that's not correct and may lead to problems, particularly if at some moment you use a different scanner generator. Rules must have actions; if you want a pattern to do nothing, use the action ; (which is the usual convention) or {}.

    6. There's no need to read the input into memory, as your main does. The (f)lex scanner handles all the details of reading the file, no matter how long it might be.