I am learning bison/flex. I have successfully parsed a simple C code file with bison/flex. Now I am wondering about parsing the header file included by a test C file using bison/flex. Can it do that?
To put it more simply, I am attaching sample code to illustrate my question.
Here is the Test file that includes a header file (.h) also.
test.c which includes a header file header.h
/* test.c — the file the asker wants to parse; it includes header.h. */
#include <stdio.h>
#include "header.h"
int main (int c, int b)
{
bigNumber a; /* Desired behavior: when the parser comes across
"bigNumber" it should learn the underlying
datatype from header.h and print
that type. */
while ( 1 )
{
newData d; /* The same should happen for "newData". */
}
}
header.h
/* These "custom datatypes" are plain textual macros, so a preprocessor
 * must expand them before the parser ever sees bigNumber/newData. */
#define newData int
#define bigNumber double
lexer.l
%{
/* Scanner for the toy C subset; token codes come from the
 * bison-generated header c.tab.h.
 *
 * NOTE(review):
 *  - The "^#include ..." rule below simply discards include lines; this
 *    is the rule that must change to actually read the header file.
 *  - The multi-line comment rule is greedy: (.*\n)* can swallow real
 *    code sitting between two comments — TODO confirm acceptable here.
 */
#include <stdio.h>
#include <string.h>
#include "c.tab.h"
%}
alpha [a-zA-Z]
digit [0-9]
%%
[ \t] { ; }
  /* NOTE(review): this class also contains a space; spaces only reach
   * the rule above because it appears first. A pattern of just \n
   * would be clearer. */
[ \n] { yylineno = yylineno + 1;}
int { return INT; }
float { return FLOAT; }
char { return CHAR; }
void { return VOID; }
double { return DOUBLE; }
for { return FOR; }
while { return WHILE; }
if { return IF; }
else { return ELSE; }
printf { return PRINTF; }
struct { return STRUCT; }
  /* Include directives are currently thrown away, not processed. */
^"#include ".+ { ; }
{digit}+ { return NUM; }
  /* Identifiers come after the keywords, so keywords win on a tie. */
{alpha}({alpha}|{digit})* { return ID; }
"<=" { return LE; }
">=" { return GE; }
"==" { return EQ; }
"!=" { return NE; }
">" { return GT; }
"<" { return LT; }
"." { return DOT; }
\/\/.* { ; }
\/\*(.*\n)*.*\*\/ { ; }
. { return yytext[0]; }
%%
bison file (c.y)
%{
/* Toy parser for a small C subset.
 * NOTE(review): #include-ing lex.yy.c into the prologue is a tutorial
 * shortcut; normally the scanner is compiled separately and linked. */
#include <stdio.h>
#include <stdlib.h>
#include"lex.yy.c"
#include<ctype.h>
int count=0;
extern FILE *fp;
%}
%token INT FLOAT CHAR DOUBLE VOID
%token FOR WHILE
%token IF ELSE PRINTF
%token STRUCT
%token NUM ID
/* NOTE(review): INCLUDE is declared but never produced by the lexer or
 * used in any rule below. */
%token INCLUDE
%token DOT
%right '='
%left AND OR
%left '<' '>' LE GE EQ NE LT GT
%%
/* NOTE(review): the start symbol accepts exactly ONE function or ONE
 * declaration — a file with several top-level items will not parse. */
start
: Function
| Declaration
;
/* Declaration block */
Declaration
: Type Assignment ';'
| Assignment ';'
| FunctionCall ';'
| ArrayUsage ';'
| Type ArrayUsage ';'
| StructStmt ';'
| error
;
/* Assignment block.
 * NOTE(review): right-recursive and it mixes ',' with the arithmetic
 * operators, so real C precedence is not modelled here. */
Assignment
: ID '=' Assignment
| ID '=' FunctionCall
| ID '=' ArrayUsage
| ArrayUsage '=' Assignment
| ID ',' Assignment
| NUM ',' Assignment
| ID '+' Assignment
| ID '-' Assignment
| ID '*' Assignment
| ID '/' Assignment
| NUM '+' Assignment
| NUM '-' Assignment
| NUM '*' Assignment
| NUM '/' Assignment
| '\'' Assignment '\''
| '(' Assignment ')'
| '-' '(' Assignment ')'
| '-' NUM
| '-' ID
| NUM
| ID
;
/* Function Call Block */
FunctionCall
: ID'('')'
| ID'('Assignment')'
;
/* Array Usage */
ArrayUsage
: ID'['Assignment']'
;
/* Function block */
Function
: Type ID '(' ArgListOpt ')' CompoundStmt
;
ArgListOpt
: ArgList
|
;
ArgList
: ArgList ',' Arg
| Arg
;
Arg
: Type ID
;
CompoundStmt
: '{' StmtList '}'
;
StmtList
: StmtList Stmt
|
;
Stmt
: WhileStmt
| Declaration
| ForStmt
| IfStmt
| PrintFunc
| ';'
;
/* Type Identifier block.
 * NOTE(review): macro-defined types such as bigNumber are NOT listed
 * here — the lexer delivers them as plain ID tokens, which is exactly
 * why the header must be preprocessed first. */
Type
: INT
| FLOAT
| CHAR
| DOUBLE
| VOID
;
/* Loop Blocks */
WhileStmt
: WHILE '(' Expr ')' Stmt
| WHILE '(' Expr ')' CompoundStmt
;
/* For Block */
ForStmt
: FOR '(' Expr ';' Expr ';' Expr ')' Stmt
| FOR '(' Expr ';' Expr ';' Expr ')' CompoundStmt
| FOR '(' Expr ')' Stmt
| FOR '(' Expr ')' CompoundStmt
;
/* IfStmt Block */
IfStmt
: IF '(' Expr ')' Stmt
;
/* Struct Statement */
StructStmt
: STRUCT ID '{' Type Assignment '}'
;
/* Print Function */
PrintFunc
: PRINTF '(' Expr ')' ';'
;
/*Expression Block.
 * NOTE(review): the empty first alternative makes Expr nullable, which
 * lets while()/if() bodies parse with an empty condition. */
Expr
:
| Expr LE Expr
| Expr GE Expr
| Expr NE Expr
| Expr EQ Expr
| Expr GT Expr
| Expr LT Expr
| Assignment
| ArrayUsage
;
%%
/* Entry point: open the file named on the command line, hand it to the
 * flex-generated scanner via yyin, and run the parser.
 *
 * Fixes: the original never checked argc or the fopen() result, so a
 * missing/unreadable argument led to fopen(NULL)/fclose(NULL) and a
 * scanner reading from a NULL stream (undefined behavior). */
int main(int argc, char *argv[])
{
    if (argc < 2) {
        fprintf(stderr, "usage: %s <file.c>\n", argv[0]);
        return 1;
    }
    yyin = fopen(argv[1], "r");
    if (!yyin) {
        perror(argv[1]);
        return 1;
    }
    if (!yyparse())
        printf("\nParsing complete\n");
    else
        printf("\nParsing failed\n");
    fclose(yyin);
    return 0;
}
/* Error callback invoked by yyparse: report the scanner's line number,
 * bison's message, and the offending lexeme.
 *
 * Fixes: the original relied on an implicit int return type (invalid
 * since C99) and printed diagnostics to stdout; errors now go to stderr
 * so they are not interleaved with normal output. */
int yyerror(const char *s) {
    fprintf(stderr, "%d : %s %s\n", yylineno, s, yytext);
    return 0;
}
/* Called by the scanner when it hits end of input; returning 1 tells
 * flex there are no further files, so yylex reports EOF to the parser. */
int yywrap() { return 1; }
What modifications should be made in the lexer (.l) and bison (.y) files so that, while parsing a C file, if that file includes some header file, the parser goes to that header file, reads it, and returns to the original C file — and if a custom-defined datatype is present, it knows the datatype from the header file and prints it?
Is this possible? What modifications do I have to make? Thank you.
Flex has a feature which makes it relatively easy to handle things like C's #include
directive. It's described at length in the flex manual chapter on multiple input buffers, with code examples, and you should refer to that document for precise details. (I put some sample code at the end of this answer.)
In the flex manual, the scanner itself recognizes the #include
directive and handles the inclusion transparently; the parser never sees the directive. That has a certain appeal; the parser only needs to parse a stream of tokens, and the lexical analyser takes full responsibility for producing the token stream, which includes reading from included files.
But as your header.h
shows, handling #include
directives is not all that is required. To actually parse C, you need to implement the C preprocessor, or at least as much of it as you care about. That includes being able to #define
macros, and it also includes substituting the macro definition for any use of the macro in the program. That's a much more complicated process. And that's not all, because the preprocessor also allows conditional inclusion (#ifdef
, #if
, etc.). In order to implement the #if
directive, you need to be able to parse and evaluate an arbitrary arithmetic expression (without variables but with macro substitution), which might best be done by calling a C expression parser.
There are various ways of structuring a solution which includes a preprocessor. One possibility is to implement the entire preprocessor in the lexical scanner, consistent with the sample code included in this answer. However, as can also be seen in that sample code, such an implementation will be quite irritating because it will basically involve writing a state machine for each preprocessor directive, something which could much more easily be accomplished with a parser generator.
So a second possibility is to embed the preprocessor in the C parser. But that will require a lot of communication from the parser to the lexer, complicated by the fact that the parse is not usually synchronised with the lexical analysis (because the parser often has read a lookahead token which has not yet been parsed when parser actions execute). For example, if the macro definition mapping is kept in the parser, then the parser will have to be able to push replacement tokens onto the input stream so that they are subsequently read before the lookahead token.
And yet another possibility is to put the preprocessor as a third component, between the lexer and the parser. Bison can produce "push-parsers", in which the parser is called with each successive token rather than calling yylex every time it needs a token. That's a much more convenient interface for integrating with a macro preprocessor. In this model, the preprocessor could be implemented as a separate bison-generated parser which reads tokens from the lexer in the normal way and feeds them one at a time to the C parser using the push API.
The full preprocessor implementation is not as complicated as a C compiler. But it's not something which can be summarised in a few paragraphs in a Stack Overflow answer.
So the best I can do here is provide a simple implementation (adapted from the flex manual) for the buffer state stack implementation of the #include
directive. I assume that you are familiar with (f)lex "start conditions", which are used to build a simple state machine for parsing preprocessor directives. (If not, see the previous chapter of the flex manual.)
%{
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Provided elsewhere: search the include path for `name` and return an
 * open stream, or NULL if the file cannot be found. */
FILE *find_include_file(const char *name);
int yyerror(const char *msg);
%}
%x PP PP_INCLUDE PP_INCLUDE_SKIP
%%
  /* If a line starts with a #, handle it as a preprocessor directive.
   * (BEGIN must be spelled in upper case — it is flex's start-condition
   * macro; lowercase begin() is an undeclared function.) */
^[[:blank:]]*#     { BEGIN(PP); }

  /* The rest of your rules go here */
  /* ... */

<PP>{
  /* This state parses only the first word of the directive. */
  [[:blank:]]+                        ; /* Ignore whitespace */
  "/*"[^*]*[*]+([^*/][^*]*[*]+)*"/"   ; /* Also ignore comments */
  include          { BEGIN(PP_INCLUDE); } /* include directive */
  /* Handle other preprocessor directives here */
  \n               { BEGIN(INITIAL); } /* null directive does nothing */
  .+               { yyerror("Invalid preprocessor directive"); }
}
<PP_INCLUDE>{
  /* This state parses and handles the file name of an #include. */
  ["][^"]+["] |
  [<][^>]+[>]      { /* Strip the closing quote/bracket (yyleng, not yylen). */
                     yytext[yyleng - 1] = 0;
                     FILE* incl = find_include_file(yytext + 1);
                     if (incl) {
                       yyin = incl;
                       yypush_buffer_state(yy_create_buffer(yyin, YY_BUF_SIZE));
                       BEGIN(INITIAL);
                     }
                     else {
                       fprintf(stderr, "Could not find include file %s\n", yytext + 1);
                       /* You might want to stop the parse instead of continuing. */
                       BEGIN(PP_INCLUDE_SKIP);
                     }
                   }
}
<PP_INCLUDE_SKIP,PP_INCLUDE>{
  /* PP_INCLUDE_SKIP ignores the rest of a preprocessor line, producing
   * an error if anything other than whitespace or a comment remains. */
  [[:blank:]]+                        ; /* Ignore whitespace */
  "/*"[^*]*[*]+([^*/][^*]*[*]+)*"/"   ; /* Also ignore comments */
  \n               { BEGIN(INITIAL); } /* End of directive: resume normal scan.
                                        * (The original flagged this newline as
                                        * an error even after a valid include.) */
  .+               { yyerror("Invalid #include directive");
                     BEGIN(INITIAL);
                   }
}
<*><<EOF>>         { yypop_buffer_state();
                     /* The buffer stack starts with a buffer reading from yyin.
                      * If the EOF was found in the initial input file, the stack will
                      * be empty after the pop, and YY_CURRENT_BUFFER will be NULL. In
                      * that case, the parse is finished and we return EOF to the caller.
                      * Otherwise, we need to skip the rest of the #include directive
                      * and continue producing tokens from where we left off.
                      */
                     if (YY_CURRENT_BUFFER)
                       BEGIN(PP_INCLUDE_SKIP);
                     else
                       return 0;
                   }