Search code examples
c++flex-lexer

Flex error "end of buffer missed" for very specific inputs


I'm currently writing a parser for a compiler backend with Bison and Flex. Both tools are in C++ mode.

Everything seems to be working okay, except that flex sometimes outputs "fatal flex scanner internal error--end of buffer missed", but only for very specific inputs. If I add, change, or remove even one character from the input file, the scanner works as expected.

Here's my scanner

%{
#include <string>

#include "lexer.hpp"
using Token = yy::Parser::token;

#define yyterminate() return( Token::YYEOF )

#define YY_USER_ACTION yylloc->step(); yylloc->columns(yyleng);
#define YY_VERBOSE

%}

/* Target the C++ implementation */
%option c++
/* Leave buffer-switching to us */
%option noyywrap 
/* Don't generate a default rule on our behalf */
%option nodefault
/* Don't try to #include unistd */
%option nounistd

/* Don't try to push tokens back into the input stream */
%option noinput 
%option nounput
%option stack
%option verbose
%option noyylineno
%option noyyget_lineno yyset_lineno yyget_out yyset_out yyget_in yyset_in
%option warn
%option noyymore
%option ecs
%option align
%option read
/* We're not writing an interpreter */
%option never-interactive batch 
/* Write a source file, but not a header file */
%option outfile="lexer.cpp"

%{

// Stub for the base-class scanner entry point: it unconditionally aborts.
// NOTE(review): presumably the real scanning routine is provided by the
// derived Lexer class declared in "lexer.hpp" - confirm against that header.
int yyFlexLexer::yylex()
{
    abort();
}

bool Lexer::setBuffer(std::string_view str) noexcept
{
    yy_buffer_state* state = static_cast<yy_buffer_state*>(yyalloc(sizeof(yy_buffer_state)));
    if (!state)
    {
        return (false);
    }
    memset(state, 0, sizeof(yy_buffer_state));

    state->yy_buf_size = (int) (str.size());
    state->yy_buf_pos = state->yy_ch_buf = const_cast<char*>(str.data());
    state->yy_is_our_buffer = 0;
    state->yy_input_file = NULL;
    state->yy_n_chars = state->yy_buf_size;
    state->yy_is_interactive = 0;
    state->yy_at_bol = 1;
    state->yy_fill_buffer = 0;
    state->yy_buffer_status = YY_BUFFER_NEW;

    yy_switch_to_buffer( state );

    return (true);
}

%}


/* ---- Named pattern fragments ------------------------------------------- */
newline     (\n)


%x POST_SIGIL

/* vvvv Comments not allowed beyond this point vvvvvvvvvvvvvvvvvvvvvvvvvvvv */
%%

%{
    yylloc->step();
%}

<POST_SIGIL>[a-zA-Z][a-zA-Z0-9_]* {
        yylval->emplace<IdentifierToken>(yytext);
        BEGIN(INITIAL);
        return Token::IDENTIFIER;
    }

<POST_SIGIL>. { throw yy::Parser::syntax_error(*yylloc, "invalid character: " + std::string(yytext)); }

<INITIAL>{
"export"    return Token::EXPORT;
"thread"    return Token::THREAD;
"section"   return Token::SECTION;

"env"   return Token::ENV;
"phi"   return Token::PHI;

"type"  return Token::TYPE;
"align" return Token::ALIGN;

"data"  return Token::DATA;

"$"         BEGIN(POST_SIGIL); return Token::DOLLAR;
","         return Token::COMMA;
"..."       return Token::ELLIPSIS;
"@"         BEGIN(POST_SIGIL); return Token::AT;
"%"         BEGIN(POST_SIGIL); return Token::PERCENT;
"function"  return Token::FUNCTION;
"("         return Token::LPAREN;
")"         return Token::RPAREN;
"{"         return Token::LBRACE;
"}"         return Token::RBRACE;
"\+"        return Token::PLUS;
":"         BEGIN(POST_SIGIL); return Token::COLON;

"sb"        return Token::SB;
"ub"        return Token::UB;
"sh"        return Token::SH;
"uh"        return Token::UH;

"jmp"       return Token::JMP;
"jnz"       return Token::JNZ;
"hlt"       return Token::HLT;
"ret"       return Token::RET;
"call"      return Token::CALL;
"cast"      return Token::CAST;
"copy"      return Token::COPY;

"="         return Token::EQUALS;

"w" return Token::W;
"l" return Token::L;
"s" return Token::S;
"d" return Token::D;
"b" return Token::B;
"h" return Token::H;
"z" return Token::Z;

store(d|s|l|w|h|b) {
    yylval->emplace<StoreToken>(yytext[5]);
    return Token::STORE; 
}

load {
    yylval->emplace<LoadToken>("d"); //FIXME
    return Token::LOAD; 
}

load(d|s|l|w|h|b|sw|uw|sh|uh|sb|ub) { 
    yylval->emplace<LoadToken>(yytext + 4); 
    return Token::LOAD; 
}

"blit" return Token::BLIT;

alloc(4|8|16) { 
    yylval->emplace<AllocToken>(static_cast<int8_t>(std::stoi(yytext + 5))); 
    return Token::ALLOC; 
}

c(sle|slt|sge|sgt|ule|ult|uge|ugt)(w|l|s|d|q|t|h|f) {
    yylval->emplace<CompareToken>(std::string{yytext + 1, yytext + 4}, yytext + 4);
    return Token::COMPARE; 
}

c(eq|ne|le|lt|ge|gt|uo)(w|l|s|d|q|t|h|f) {
    yylval->emplace<CompareToken>(std::string{yytext + 1, yytext + 3}, yytext + 3);
    return Token::COMPARE; 
}

co(w|l|s|d|q|t|h|f) {
    yylval->emplace<CompareToken>("o", yytext + 2);
    return Token::COMPARE; 
}

(extsw|extuw|extsh|extuh|extsb|extub|exts|truncd|stosi|stoui|dtosi|dtoui|swtof|uwtof|sltof|ultof) {
    yylval->emplace<ConversionToken>();
    return Token::CONVERSION;
}

(add|and|div|mul|neg|or|rem|sar|shl|shr|sub|udiv|urem|xor) {
    /* Arithmetic/bitwise mnemonics. The unsigned division mnemonic is
       spelled "udiv"; the former "udix" alternative could never match it,
       so "udiv" fell through to the generic identifier rule. */
    yylval->emplace<BinaryToken>();
    return Token::BINARY;
}

[-]?[0-9]+ {
    yylval->emplace<uint64_t>(std::stoll(yytext));
    return Token::NUMBER;
}

(s_|d_)?[-]?[0-9]*\.[0-9]+ {
    if(yytext[0] == 's' || yytext[0] == 'd')
        yylval->emplace<double>(std::stod(yytext + 2));
    else
        yylval->emplace<double>(std::stod(yytext));

    return Token::FLOAT;
}

\"[^\"]*\" {
    yylval->emplace<std::string>(yytext + 1, yyleng - 2);
    return Token::STRING_LITERAL;
}

"#".*       yylloc->step();
[ \t]+      yylloc->step();
{newline}+  yylloc->lines (yyleng); yylloc->step();

[a-zA-Z][a-zA-Z0-9_]* {
    /* Bare identifier in the INITIAL state. The pattern matches no sigil,
       so the whole of yytext is the name; skipping yytext[0] would drop
       the identifier's first letter. */
    yylval->emplace<IdentifierToken>(yytext);
    return Token::IDENTIFIER;
}

.   { throw yy::Parser::syntax_error(*yylloc, "invalid character: " + std::string(yytext)); }

<<EOF>> return Token::YYEOF;
}

And here's one example input file for which I get the error (note the empty line at the end):

# tests that the address matcher is not
# confused by the two multiplications

# note: the code handling apple asm fixes
# ruins the good work of the matcher here,
# I should revisit these fixes

export function w $f(l %i, l %j) {
@start
    %off1 =l mul %i, 8
    %a_i =l add $a, %off1
    %off2 =l mul %j, 4
    %a_ij =l add %a_i, %off2
    %x =w loadsw %a_ij
    ret %x
}

# >>> driver
# int a[] = {1, 2, 3, 4};
# extern int f(long long, long long);
# int main() {
#   return !(f(0, 0) == 1 && f(0, 1) == 2 && f(1, 0) == 3 && f(1, 1) == 4);
# }
# <<<


Solution

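  • The problem was the Lexer::setBuffer() function. Flex expects every scan buffer to end with two end-of-buffer (NUL) bytes, which is what yy_scan_buffer() checks for; the hand-built buffer state pointed directly at the string_view's data, which has no such terminator, so whether the scanner ran past the end depended on the bytes that happened to follow the input in memory. After reading the Flex manual again, I saw that the generated lexer class already has a switch_streams function. If I use that instead of setBuffer, everything works as expected. I also don't get any ASAN errors.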

    See also this GitHub discussion.