Search code examples
c++boost-spirit

Access violation in boost::spirit::lex


I've reduced my code to the absolute minimum needed to reproduce the error (sadly that is still 60 lines, so not quite Minimal, but it's a VCE at least).

I'm using Boost 1.56 in Visual Studio 2013 (Platform Toolset v120).

The code below gives me an Access Violation unless I uncomment the marked lines. By doing some tests, it seems boost::spirit doesn't like it if the enum starts at 0 (in my full code I have more values in the enum and I just set IntLiteral = 1 and it also got rid of the access violation error, although the names were wrong because ToString was off by one when indexing into the array).

Is this a bug in boost::spirit or did I do something wrong?

#include <iostream>
#include <string>
#include <vector>

#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>

namespace lex = boost::spirit::lex;

typedef lex::lexertl::token<char const*> LexToken;
typedef lex::lexertl::actor_lexer<LexToken> LexerType;
typedef boost::iterator_range<char const*> IteratorRange;

// Token ids passed to the lexer rule in Lexer; the values are also used
// directly as indices into tokenTypeNames by ToString.
enum TokenType
{
    //Unused, // <-- Uncommenting this fixes the error (1/2)
    IntLiteral,
};

// Display names, indexed by TokenType value (see ToString). The entries must
// stay in the same order as the enumerators of TokenType.
std::string tokenTypeNames[] = {
    //"unused", // <-- Uncommenting this line fixes the error (2/2)
    "int literal",
};

// Looks up the display name for `t` in tokenTypeNames and returns a copy of it.
// `t` is used as the array index as-is; no range check is performed.
std::string ToString(TokenType t)
{
    const std::string& name = tokenTypeNames[t];
    return name;
}

// Lexer definition with a single rule: a decimal integer literal without a
// leading zero is reported with the id TokenType::IntLiteral.
template <typename T>
struct Lexer : boost::spirit::lex::lexer < T >
{
    Lexer()
    {
        // `self` is a member of the dependent base class lexer<T>, so it has to
        // be reached through `this->`; an unqualified `self` is only accepted by
        // compilers that do not implement two-phase name lookup.
        this->self.add
            // Literals
            ("[1-9][0-9]*", TokenType::IntLiteral);
    }
};

// Tokenizes the hard-coded input "33" and prints, for every token, its name
// (via ToString) followed by the matched text in parentheses.
int main(int argc, char* argv[])
{
    std::cout << "Boost version: " << BOOST_LIB_VERSION << std::endl;
    std::string input = "33";

    // Character range [inputIt, inputEnd) over the input string's buffer.
    char const* inputIt = input.c_str();
    char const* inputEnd = &input[input.size()];

    Lexer<LexerType> tokens;
    LexerType::iterator_type token = tokens.begin(inputIt, inputEnd);
    LexerType::iterator_type end = tokens.end();

    // NOTE(review): `token` is dereferenced before it is compared against `end`;
    // the Spirit.Lex examples test `iter != end` first - confirm this order is intended.
    for (; token->is_valid() && token != end; ++token)
    {
        // The token value is read as the iterator range of the matched characters.
        auto range = boost::get<IteratorRange>(token->value());
        std::cout << ToString(static_cast<TokenType>(token->id())) << " (" << std::string(range.begin(), range.end()) << ')' << std::endl;
    }

    // Blocks until input is available on stdin (presumably to keep the console window open).
    std::cin.get();
    return 0;
}

If I uncomment the lines I get:

Boost version: 1_56
int literal (33)

Solution

  • The fact that it "works" if you uncomment those lines is pure accident.

    From the docs spirit/lex/tutorials/lexer_quickstart2.html:

    To ensure every token gets assigned a id the Spirit.Lex library internally assigns unique numbers to the token definitions, starting with the constant defined by boost::spirit::lex::min_token_id

    See also this older answer:

    So you can just fix it using the offset, but I guess it will remain a brittle solution, as it is very easy to let the enum get out of sync with the actual token definitions in the lexer tables.

    I'd suggest using the nameof() approach as given in the linked answer, which leverages named token_def<> objects.

    Live On Coliru

    #include <iostream>
    #include <string>
    #include <vector>
    
    #include <boost/config/warning_disable.hpp>
    #include <boost/spirit/include/lex_lexertl.hpp>
    
    namespace lex = boost::spirit::lex;
    
    typedef lex::lexertl::token<char const*> LexToken;
    typedef lex::lexertl::actor_lexer<LexToken> LexerType;
    typedef boost::iterator_range<char const*> IteratorRange;
    
    // Token ids start at min_token_id, the constant from which Spirit.Lex
    // itself starts numbering token definitions.
    enum TokenType {
        IntLiteral = boost::spirit::lex::min_token_id
    };
    
    // Maps a token id to its display name.
    //
    // TokenType values are offset by min_token_id, so that offset is removed
    // before indexing the name table. Ids that fall outside the table yield
    // "<unknown token>" instead of reading past the array, because callers pass
    // ids that were cast straight from the lexer's output.
    std::string const& ToString(TokenType t) {
        static const std::string tokenTypeNames[] = {
            "int literal",
        };
        static const std::string unknownName = "<unknown token>";

        // Signed arithmetic so that ids below min_token_id become negative
        // rather than wrapping around to a huge index.
        const long count = static_cast<long>(sizeof(tokenTypeNames) / sizeof(tokenTypeNames[0]));
        const long index = static_cast<long>(t) - static_cast<long>(boost::spirit::lex::min_token_id);
        if (index < 0 || index >= count) {
            return unknownName;
        }
        return tokenTypeNames[index];
    }
    
    template <typename T>
    struct Lexer : boost::spirit::lex::lexer<T> {
        Lexer() {
            this->self.add
                // Literals
                ("[1-9][0-9]*", TokenType::IntLiteral);
        }
    };
    
    // Tokenizes the fixed input "33" and prints, for every token, its name
    // (via ToString) followed by the matched text in parentheses.
    int main() {
        std::cout << "Boost version: " << BOOST_LIB_VERSION << std::endl;
        std::string input = "33";
    
        // Character range [inputIt, inputEnd) over the input string's buffer.
        char const* inputIt = input.c_str();
        char const* inputEnd = &input[input.size()];
    
        Lexer<LexerType> tokens;
        LexerType::iterator_type token = tokens.begin(inputIt, inputEnd);
        LexerType::iterator_type end = tokens.end();
    
        // Compare against `end` first so the iterator is never dereferenced once
        // it has reached the end of the token stream.
        for (; token != end && token->is_valid(); ++token)
        {
            // The token value is read as the iterator range of the matched characters.
            auto range = boost::get<IteratorRange>(token->value());
            std::cout << ToString(static_cast<TokenType>(token->id())) << " (" << std::string(range.begin(), range.end()) << ')' << std::endl;
        }
    }