Search code examples
c++includelex

Explanation for implementation of C/C++ nested `#include "Header"` grammar by lex/flex?


I'm studying start states and nested input files in lex/flex.

In the book "flex & bison", I'm confused by the example implementation of the C/C++ #include "Header" grammar:

Here's part of the example lex file:

/* Companion source code for "flex & bison", published by O'Reilly
 * Media, ISBN 978-0-596-15597-1
 * Copyright (c) 2009, Taughannock Networks. All rights reserved.
 * See the README file for license conditions and contact info.
 * $Header: /home/johnl/flnb/code/RCS/fb2-3.l,v 2.3 2010/01/04 02:43:58 johnl Exp $
 */

/* fb2-3 skeleton for include files */

%option noyywrap warn nodefault
%x IFILE
  /*
   * One entry per open input file.  Entries are chained through `prev`,
   * so the list works as a stack: newfile() pushes, popfile() pops.
   */
  struct bufstack {
    struct bufstack *prev;  /* entry that was current before this one */
    YY_BUFFER_STATE bs;     /* flex buffer belonging to this entry */
    int lineno;             /* line number saved while a nested file is read */
    char *filename;         /* file name associated with this entry */
    FILE *f;                /* stdio stream the buffer reads from */
  };

  struct bufstack *curbs = NULL;  /* top of the stack; starts out empty */

  char *curfilename;              /* name of the file being scanned now */

  int popfile(void);
  int newfile(char *fn);

%%
^"#"[ \t]*include[ \t]*[\"<] { BEGIN IFILE; }

  /* In IFILE the opening quote or angle bracket has already been consumed,
     so this pattern collects the file name and stops in front of the
     closing delimiter (or any whitespace). */
<IFILE>[^ \t\n\">]+          {
                               int c;

                               /* Discard the rest of the include line.  Stop on
                                  0 as well as EOF: flex versions differ in which
                                  of the two input() returns at end of input, and
                                  a final line without a newline must not spin. */
                               while((c = input()) != '\n' && c != 0 && c != EOF)
                                 ;
                               yylineno++;
                               if(!newfile(yytext))
                                 yyterminate(); /* no such file */
                               BEGIN INITIAL;
                             }

  /* Anything else where a file name is expected is a malformed include. */
<IFILE>.|\n                { fprintf(stderr, "%4d bad include line\n", yylineno);
                             yyterminate();
                           }

  /* Ordinary text: print the line number at the start of each line and
     echo everything else unchanged. */
^.                         { fprintf(yyout, "%4d %s", yylineno, yytext); }
^\n                        { fprintf(yyout, "%4d %s", yylineno++, yytext); }
\n                         { ECHO; yylineno++; }
.                          { ECHO; }

  /* End of one input: resume the including file if popfile() finds one,
     otherwise print the summary and stop scanning. */
<<EOF>>                    {
                             if(!popfile()) {
                               fprintf(yyout, "end of file, total lines: %4d %s", yylineno, yytext);
                               yyterminate();
                             }
                           }
%%

/*
 * Entry point: scan the file named by argv[1].
 *
 * Returns 0 once scanning has finished, or 1 when no file name was given
 * or the file could not be opened.
 */
int
main(int argc, char **argv)
{
  if(argc < 2) {
    fprintf(stderr, "need filename\n");
    return 1;
  }

  /* newfile() reports the reason itself when the open fails */
  if(!newfile(argv[1]))
    return 1;

  yylex();
  return 0;
}

/*
 * Open `fn` and make it the scanner's current input by pushing a new
 * entry onto the bufstack list.
 *
 * If another file is currently being read, its line number is saved in
 * its entry first; yylineno then restarts at 1 for the new file.
 *
 * `fn` is stored as given, not copied, so it must stay valid for as long
 * as the entry's filename (and curfilename) may be read.
 *
 * Returns 1 on success, or 0 after reporting the error with perror() when
 * the file cannot be opened.  Exits the program when memory runs out.
 */
int
  newfile(char *fn)
{
  FILE *f = fopen(fn, "r");
  struct bufstack *bs;

  /* check the file before allocating, so a bad name leaks nothing */
  if(!f) { perror(fn); return 0; }

  bs = malloc(sizeof *bs);
  if(!bs) { perror("malloc"); exit(1); }

  /* remember state */
  if(curbs) curbs->lineno = yylineno;
  bs->prev = curbs;

  /* set up current entry */
  bs->bs = yy_create_buffer(f, YY_BUF_SIZE);
  bs->f = f;
  bs->filename = fn;
  yy_switch_to_buffer(bs->bs);
  curbs = bs;
  yylineno = 1;
  curfilename = fn;
  return 1;
}

/*
 * Close the current input file and go back to the one that included it.
 *
 * The current entry's stream is closed and its flex buffer and stack
 * entry are released.  If an including file remains, its buffer, line
 * number and file name become current again.
 *
 * Returns 1 when scanning can continue in a previous file, or 0 when the
 * stack was already empty or the entry just popped was the last one.
 */
int
  popfile(void)
{
  struct bufstack *bs = curbs;

  if(!bs) return 0;

  /* unlink before freeing so curbs never points at released memory,
     even when this was the last entry */
  curbs = bs->prev;

  /* get rid of current entry */
  fclose(bs->f);
  yy_delete_buffer(bs->bs);
  free(bs);

  if(!curbs) return 0;

  /* switch back to previous */
  yy_switch_to_buffer(curbs->bs);
  yylineno = curbs->lineno;
  curfilename = curbs->filename;
  return 1;
}

Please help me with these questions:

  1. Why does <IFILE>[^ \t\n\">]+ match the closing " or > of the Header?
  2. Why does { int c; while((c = input()) && c != '\n') ; } eat all the characters up to the end of the line (\n)? Will yytext then contain exactly the Header file name?
  3. How would I implement a grammar like Java's import java.util.Decoder; ?

Solution

  • Why does [^ \t\n\">]+ match the closing " or > of the Header?

    The answer is: It doesn't.

    What it does is match every character up to, but not including, one of those (or a space, tab, or newline); the match stops when it reaches such a character. So when you have a match and the code for the rule is executed, you know that the next character in the file after the match must be either a ", a >, or a white-space character.

    Let's take an example:

    #include <foo/bar.h>
    
    • The rule ^"#"[ \t]*include[ \t]*[\"<] matches #include <.
    • The rule <IFILE>[^ \t\n\">]+ matches foo/bar.h

    When you then run the code

    int c;
    while((c = input()) && c != '\n') ;
    

    it will start by reading the ending >, and then continue to read and discard all the remaining characters until the end of the line.

    To verify this you could add some output of the character in the loop.