Search code examples
regexflex-lexerlexdiagnostic-tools

Where can the default rule be matched?


I have been slowly getting used to the whims of the flex lexer and, while it's pretty cool once (1) I have manage to get it to actually work and (2) I have learned just how it works, I am not sure how to improve upon my grammar definition.

The grammar is medium-complex, so I would prefer not to paste it here. I added it - see below But here are the build stats.

/usr/local/bin/flex -P -d -v 
classic.lpp:466: warning, -s option given but default rule can be matched
flex version 2.6.4 usage statistics:
  scanner options: -+dsvB8 -Cem -Pc
  314/2000 NFA states
  114/1000 DFA states (563 words)
  40 rules
  Compressed tables always back-up
  7/40 start conditions
  187 epsilon states, 127 double epsilon states
  26/100 character classes needed 322/500 words of storage, 0 reused
  1385 state/nextstate pairs created
  250/1135 unique/duplicate transitions
  123/1000 base-def entries created
  545/2000 (peak 748) nxt-chk entries created
  72/2500 (peak 333) template nxt-chk entries created
  220 empty table entries
  14 protos created
  9 templates created, 18 uses
  37/256 equivalence classes created
  8/256 meta-equivalence classes created
  2 (5 saved) hash collisions, 151 DFAs equal
  0 sets of reallocations needed
  1629 total table entries needed

Apart from the fact that I don't really know what the above means (except from things which are obvious like the number of rules and start conditions), what I don't know is how to get flex to tell me where the default rule can be matched.

Of course I can add a default rule 'catch'

But that means I have to find a means of triggering it. Again, I can ruthlessly draw up a regex/match analysis and walk through it until I find what is missing - but is there no way that flex can tell me eg: "in start rule 3, the following characters are not being matched??


minimal example of the grammar follows

flex version 2.6.4

/usr/local/bin/flex -P -d -v adv.lpp

%option debug
%option noyywrap noinput nounput nodefault
%option noyymore
%option never-interactive
%option c++
%option stack
%option warn
%x parm brace bracket fatone literal injection

DELIMITER           ,
ALTDELIM                ⁏
BRACKET_IN          \(
BRACKET_OUT         \)
FAT_ONE_IN          ❪
FAT_ONE_OUT         ❫
LITERAL_IN          ⎣
BRACE_IN                ⎡
BRACE_OUT           ⎤
INJECT_SIGN         ⍟
I_OFFSET                \^*
I_NUMBER                [0-9]+
I_ITS                   [ijkn]
I_ITK                   [KN]
I_STACK             p([s]|[0-9]+)?
I_ITER              ([ijknKN]([+-][0-9]+)?([./]p?[0-9]+)?)
I_MORE              ([0-9]+("+"?))
MACRO_SIGN          ⌽
MACRO_NAME          [a-zA-Z_0-9]+
WHITESPACE          [\t\x0d\x0a]+
LCOMMENT                ⌽comment\(
LBCOMMENT           ⌽comment❪
LITERAL_OUT         ⎦
NOT_SPECIAL_1C      [^\t\x0d\x0a,\(\)\xe2]
NOT_LITERAL_OUT [^\xe2]|(\xe2(([^\x8e][^\0])|(\x8e[^\xa6])))
NOT_SPECIAL_3C      (\xe2[^\x81\x8c\x8d\x8e\x9d][^\0])|(\xe2\x81[^\x8f])|(\xe2\x8c[^\xbd])|(\xe2\x8d[^\x9f])|(\xe2\x8e[^\xa1\xa3\xa4\xa6])|(\xe2\x9d[^\xaa\xab])
JUST_TEXT           ({NOT_SPECIAL_1C}|{NOT_SPECIAL_3C})+

/**
   E2 81|8C|8D|8E|9D
 ❪ E2 9D AA √ FAT_ONE_IN
 ❫ E2 9D AB √ FAT_ONE_OUT
 ⁏ E2 81 8F √ ALTDELIM
 ⌽ E2 8C BD √ MACRO_SIGN
 ⍟ E2 8D 9F √ INJECT_SIGN
 ⎡ E2 8E A1 √ BRACE_IN
 ⎣ E2 8E A3 √ LITERAL_IN
 ⎤ E2 8E A4 √ BRACE_OUT
 ⎦ E2 8E A6 √ LITERAL_OUT
 **/

%%

<literal>{
    {LCOMMENT} {
        yy_push_state(parm);
        return( token::MACRO_IN );
    }
    
    {LBCOMMENT} {
        yy_push_state(parm);
        return( token::MACRO_IN );
    }
    
    {LITERAL_IN} {
        yy_push_state(literal);
    }
    
    {LITERAL_OUT} {
        yy_pop_state(); /**  Pop Literal **/
    }

    {NOT_LITERAL_OUT} {
        return( token::TEXT );
    }

    <<EOF>> {
        cerr << "Unexpected end of text before close literal";
        yy_pop_state();
    }
}

<bracket>{
    {BRACKET_OUT} {
        yy_pop_state();
        return( token::TEXT );
    }
}

<fatone>{
    {FAT_ONE_OUT} {
        yy_pop_state();
        return( token::TEXT );
    }
}

<parm,brace,bracket,fatone>{
    {INJECT_SIGN}({I_ITK}|{BRACKET_IN}{I_ITK}{BRACKET_OUT}) {
        return(token::INJECTION);
    }
}

<parm>{
    {DELIMITER} {
        return( token::PARM );
    }
    
    {ALTDELIM} {
        return( token::PARM );
    }
            
    {BRACE_OUT} {
        return( token::TEXT );
    }

    {BRACKET_OUT} {
        yy_pop_state(); /**  Pop Macro **/
        return( token::MACRO_OUT );
    }

    {FAT_ONE_OUT} {
        yy_pop_state(); /**  Pop Macro **/
        return( token::MACRO_OUT );
    }
    
    <<EOF>> {
        cerr << "Unexpected end of text before macro close.";
        yy_pop_state(); //pop parm!
    }
}

<brace>{
    
    {BRACE_OUT} {
        yy_pop_state();
    }

    {ALTDELIM} {
        return (token::PARMX);
    }

    <<EOF>> {
        cerr << "Unexpected end of text before close brace";
        yy_pop_state();
    }

}

<brace,bracket,fatone>{
    {DELIMITER} {
        return( token::TEXT );
    }
}


<INITIAL,parm,brace,bracket,fatone>{

    {LITERAL_IN} {
        yy_push_state(literal);
    }
    
    ({WHITESPACE}) {
        return( token::WSS );
    }
    
    {BRACE_IN} {
        yy_push_state(brace);
    }

    {MACRO_SIGN}{MACRO_NAME}{BRACKET_IN} {
        yy_push_state(parm);
        return( token::MACRO_IN);
    }
    
    {MACRO_SIGN}{MACRO_NAME}{FAT_ONE_IN} {
        yy_push_state(parm);
        return( token::MACRO_IN );
    }
    
    {MACRO_SIGN}{MACRO_NAME}{NOT_SPECIAL_1C} {
        return( token::TEXT );
    }

    {INJECT_SIGN}{I_OFFSET}({I_NUMBER}|{I_ITS}|({BRACKET_IN}{I_OFFSET}({I_STACK}|{I_ITER}|{I_MORE}){BRACKET_OUT})) {
        return(token::INJECTION);
    }
    
    {BRACKET_IN} {
        switch(YY_START) {
            case bracket: 
                case parm: {
                yy_push_state(bracket);
            } break;
            default: break;
        }
        return( token::TEXT );
    }

    {FAT_ONE_IN} {
        switch(YY_START) {
            case fatone:
            case parm: {
                yy_push_state(fatone);
            } break;
            default: break;
        }
        return( token::TEXT );
    }

    {JUST_TEXT} {
        return( token::TEXT );
    }
    
    {MACRO_SIGN}|{INJECT_SIGN}|{BRACKET_OUT}|{FAT_ONE_OUT}|{LITERAL_OUT} {
        return( token::TEXT );
    }
}

<INITIAL>{

    ({DELIMITER}|{BRACKET_OUT}|{FAT_ONE_OUT}|{BRACE_OUT}|{ALTDELIM}|{LITERAL_OUT}) {
        return(token::TEXT);
    }

}


%%

Solution

  • Unfortunately, flex does not provide such a feature.

    You can get a clue by trying every single character in every start condition until the scanner reports an error. That's just a few lines of code, using yy_scan_buffer to set up a single character input. [See code sample below].

    Don't forget to try an EOF in every start condition; a common cause of that warning is a start condition which doesn't handle EOF. Sometimes that's a false positive, but a lot of times it's because you just didn't think about all the possible places an EOF might erroneously occur.

    False positives can happen when a start condition is entered when you know what the next input character is, perhaps because the rule has trailing context or perhaps because you called <yyless. In such a case, you could reasonably fail to match a character sequence you know can't occur, but better style is probably to catch the sequence or EOF and interpose your own error message.

    One particular case to look for is patterns for delimited tokens like strings or comments, where you haven't handled the possibility of a token missing its closing delimiter. In the case of strings, you also need to think about the possibility that the input ends in the middle of an escape sequence.


    Using the code snippet below, I got the following report:

    Failed to match 0XE2 in state INITIAL
    Failed to match 0XE2 in state parm
    Failed to match 0XE2 in state brace
    Failed to match 0XE2 in state bracket
    Failed to match 0XE2 in state fatone
    Failed to match 0XE2 in state literal
    

    Basically, you recognise a multibyte characters starting with 0xE2, and you recognise bytes other than 0xE2. But what about an input which hits EOF in the middle of the multibyte sequence starting 0xE2? (Yes, it's an invalid input. But Flex doesn't know that.)


    In case it's useful, here's a little code snippet which can be used to determine what character cannot be matched. In order to use this, you'll need to:

    • Modify the list of states at the top of main
    • Change all flex actions to return 1; (which can be mostly be done with a good text editor, by just injecting return 1; after the leading { of each action).
    • Add catch-all rules at the end, in order to avoid triggering a fatal error when the unhandled character is tried:
      <*>.|\n          { return 2; }
      <INITIAL><<EOF>> { return 0; /* Needed so the next rule doesn't apply. */}
      <<EOF>>          { return 2; }
      
    • Remove any reference to any symbol not defined in the Flex file (support functions, token definitions, etc.), or arrange for them to be defined.

    All that done, you can just add the following in your flex file (it needs to be in the file in order to have access to the start condition definitions):

    NOTE: This is the C version. If you remove all actions, then doing this test in C should be fine, but if you really want to use C++ then you'll need to make some minor adjustments.

    int main(void) {
    #define X(n) { n, #n }
      struct { int state; const char* name; } states[] = {
        X(INITIAL), X(parm), X(brace), X(bracket), X(fatone), X(literal),
        { -1, 0 }
      };
    #undef X
      for (int i = 0; states[i].state >= 0; ++i) {
        int start = states[i].start;
        char buf[3] = {0, 0, 0};
        BEGIN(start);
        yy_scan_buffer(buf, 2);
        if (yylex() == 2)
          fprintf(stderr, "Failed to match EOF in state %s\n",
                  states[i].name);
        for (int ch = 0; ch < 256; ++ch) {
          buf[0] = ch;
          BEGIN(start);
          yy_scan_buffer(buf, 3);
          if (yylex() == 2)
            fprintf(stderr, "Failed to match 0X%02X in state %s\n",
                    (unsigned)ch, states[i].name);
        }
      }
      return 0;
    }