Search code examples
Tags: bison, lex, flex-lexer

How to scan tokens only within context using Flex?


I want to create a parser for a template engine using Flex & Bison. The catch is that I only want to tokenize the expressions inside {{..}} and ${..}; everything else should be passed through as plain text.

The template can be arbitrary text with embedded code tokens, like this:

        </table:table-row>
        {{$(/report/row.xml).embed()}}
        {{$(//Accreditation/AccreditationDocument/Report).each(fragment(row) """
            <table:row>
                <table:table-cell office:value-type="string" office:string-value="${row["name"]}" />
            </table:row>
        """)}}
        <table:table-row table:number-rows-repeated="1048574" table:style-name="ro1">
            <table:table-cell table:number-columns-repeated="16384"/>
        </table:table-row>
    </table:table>

Solution

  • I have found the solution myself. Flex has a feature called Start Conditions.

    Below is the lexer.l code, which returns expression tokens only from inside {{ }}. All other text is returned as GENERAL_BODY.

    %{
    /*
     * Template lexer.
     *
     * Text outside {{ ... }} is handed to the parser one character at a time
     * as GENERAL_BODY (newlines included).  Text inside {{ ... }} is tokenized
     * as an expression: NUM, ID, SEMICOLON and single-character operators.
     *
     * Nesting of {{ }} is tracked with flex's start-condition stack.  The
     * stack lives in the scanner instance, so the depth survives across
     * yylex() calls (a local variable in the rules prologue would be reset on
     * every call) and separate reentrant scanners do not share it.
     *
     * yylval->str values are allocated with strdup().
     * NOTE(review): presumably the parser frees them -- confirm.
     */
    #include "bisondef.h"
    %}
    
    %option reentrant noyywrap never-interactive nounistd
    %option bison-bridge
    %option stack noyy_top_state
    
    WS [ \t\r\n]+
    ID [A-Za-z_][A-Za-z0-9_]*
    
    %x stmt
    
    %%
    
    "{{"    { yy_push_state(stmt, yyscanner); }
    
    <stmt>{
        "{{"    { yy_push_state(stmt, yyscanner); /* one level deeper */ }
        "}}"    { yy_pop_state(yyscanner); /* back to enclosing stmt, or to INITIAL at depth 0 */ }
        {WS}    { /* whitespace separates tokens but is not a token itself */ }
        [0-9]+  { yylval->num = atoi(yytext); return NUM; }
        "+"|"-"|"*"|"/"|"("|")" { return *yytext; }
        ";"     { return SEMICOLON; }
        {ID}    { yylval->str = strdup(yytext); return ID; }
        .       {
            /* Anything else is passed to the parser as a literal character
               token so it is reported there instead of being echoed by the
               default rule.  The cast keeps bytes >= 0x80 from turning into
               non-positive token codes. */
            return (unsigned char)*yytext;
        }
    }
    
    .|\n  {
        /* '.' alone does not match a newline; include it so line breaks in
           the template body reach the parser too. */
        yylval->str = strdup(yytext);
        return GENERAL_BODY;
    }
    
    %%
    
    /*
     * Error callback: writes "Error: <msg>" followed by a newline to stderr.
     * Always returns 0.
     */
    int yyerror(const char *msg)
    {
        fprintf(stderr, "Error: %s\n", msg);
        return 0;
    }