
Boost Spirit Qi grammar adding to list inside skipper


Parsing these strings:

int main(){
    for (const std::string input: std::vector<std::string> { 
            "module simple_in_n_out();endmodule;",
            "module simple_in_n_out(in_1);endmodule;",
            "module simple_in_n_out(in_1,in_2,in_3);endmodule;",
            })
    {
        parse_verilog_file(input);
    }
    return 0;
}

Parsing succeeds on the first two inputs (and push_back correctly captures the single identifier), but it fails on the third input, where more identifiers are added to the list:

            std::string module_name;
            stringvec module_inputs;

            module_input_list %= tok.identifier[push_back(phoenix::ref(module_inputs), _1)] % qi::lit(',');
            module_input_list.name("module_input_list");
            BOOST_SPIRIT_DEBUG_NODE(module_input_list);
            module_stmt
                %=   tok.module_ >> tok.identifier[phoenix::ref(module_name) = _1] 
                >> '(' >> -(module_input_list) >> ')'
                >> ';';
            module_stmt.name("module");
            BOOST_SPIRIT_DEBUG_NODE(module_stmt);

Output looks like:

<module_stmt>
  <try>[module]</try>
  <module_input_list>
    <try>[)][;][endmodule][;]</try>
    <fail/>
  </module_input_list>
  <success>[endmodule][;]</success>
  <attributes>[]</attributes>
</module_stmt>
<module_stmt>
  <try>[endmodule][;]</try>
  <fail/>
</module_stmt>
TODO: put the module together now
<module_stmt>
  <try></try>
  <fail/>
</module_stmt>
-------------------------
Parsing succeeded
-------------------------
module name: simple_in_n_out
<module_stmt>
  <try>[module]</try>
  <module_input_list>
    <try>[in_1][)][;][endmodule][;]</try>
    <success>[)][;][endmodule][;]</success>
    <attributes>[]</attributes>
  </module_input_list>
  <success>[endmodule][;]</success>
  <attributes>[]</attributes>
</module_stmt>
<module_stmt>
  <try>[endmodule][;]</try>
  <fail/>
</module_stmt>
TODO: put the module together now
<module_stmt>
  <try></try>
  <fail/>
</module_stmt>
-------------------------
Parsing succeeded
-------------------------
module name: simple_in_n_out
    module input: in_1
<module_stmt>
  <try>[module]</try>
  <module_input_list>
    <try>[in_1]</try>
    <success></success>
    <attributes>[]</attributes>
  </module_input_list>
  <fail/>
</module_stmt>
-------------------------
Parsing failed
-------------------------

Full code:

#define BOOST_SPIRIT_DEBUG
#include "netlist/netlistlexer.h"
namespace verilog {
    using namespace boost::spirit;
    using boost::phoenix::val;
    using boost::spirit::ascii::char_;
    using boost::spirit::ascii::string;

    ///////////////////////////////////////////////////////////////////////////////
    //  Grammar definition
    ///////////////////////////////////////////////////////////////////////////////
    template <typename Iterator, typename Lexer>
    struct verilog_grammar
    : qi::grammar<Iterator, qi::in_state_skipper<Lexer> >
    {
        template <typename TokenDef>
        verilog_grammar(TokenDef const& tok)
        : verilog_grammar::base_type(program)
        {
            using boost::spirit::_val;
            using phoenix::push_back;
            using qi::on_error;
            using qi::fail;
            using phoenix::construct;

            program
                =   +statement
                ;


            statement
                =   module_stmt
                |   end_module_stmt
                ;


            module_input_list %= tok.identifier[push_back(phoenix::ref(module_inputs), _1)] % qi::lit(',');
            module_input_list.name("module_input_list");
            BOOST_SPIRIT_DEBUG_NODE(module_input_list);
            module_stmt
                %=   tok.module_ >> tok.identifier[phoenix::ref(module_name) = _1] 
                >> '(' >> -(module_input_list) >> ')'
                >> ';';
            module_stmt.name("module");
            BOOST_SPIRIT_DEBUG_NODE(module_stmt);
            end_module_stmt
                =   (tok.endmodule_ >> ';' | tok.endmodule_)[
                    std::cout << val("TODO: put the module together now") << "\n"
                ];
            end_module_stmt.name("end_module_stmt");

            on_error<fail>
            (
                program
            , std::cout
                    << val("Error! Expecting ")
                    << _4                               // what failed?
                    << val(" here: \"")
                    << construct<std::string>(_3, _2)   // iterators to error-pos, end
                    << val("\"")
                    << std::endl
            );
        }

        std::string module_name;
        stringvec module_inputs;
        typedef boost::variant<unsigned int, std::string> expression_type;
        typedef boost::fusion::vector<std::string,std::vector<std::string>> fustring;

        qi::rule<Iterator, qi::in_state_skipper<Lexer> > program, statement;
        qi::rule<Iterator, qi::in_state_skipper<Lexer> > module_stmt;
        qi::rule<Iterator, qi::in_state_skipper<Lexer> > module_input_list;
        qi::rule<Iterator, qi::in_state_skipper<Lexer> > end_module_stmt;
    };
} // end verilog namespace

void parse_verilog_file(std::string str){
    typedef std::string::iterator base_iterator_type;
    using namespace boost::spirit;
    typedef lex::lexertl::token<
        base_iterator_type, boost::mpl::vector<unsigned int, std::string>
    > token_type;
     typedef lex::lexertl::lexer<token_type> lexer_type;
     typedef verilog::verilog_tokens<lexer_type> verilog_tokens;
     typedef verilog_tokens::iterator_type iterator_type;
     typedef verilog::verilog_grammar<iterator_type, verilog_tokens::lexer_def> verilog_grammar;
     verilog_tokens tokens;                         // Our lexer
     verilog_grammar calc(tokens);                  // Our parser

     std::string::iterator it = str.begin();
     iterator_type iter = tokens.begin(it, str.end());
     iterator_type end = tokens.end();
     bool r = qi::phrase_parse(iter, end, calc, qi::in_state("WS")[tokens.self]);

     if (r && iter == end)
     {
         std::cout << "-------------------------\n";
         std::cout << "Parsing succeeded\n";
         std::cout << "-------------------------\n";
         std::cout << "module name: " << calc.module_name << "\n";
         for (const std::string i: calc.module_inputs){
             std::cout << "    module input: " << i << "\n";
         }
     }
     else
     {
         std::cout << "-------------------------\n";
         std::cout << "Parsing failed\n";
         std::cout << "-------------------------\n";
     }

}

int main(){
    for (const std::string input: std::vector<std::string> { 
            "module simple_in_n_out();endmodule;",
            "module simple_in_n_out(in_1);endmodule;",
            "module simple_in_n_out(in_1,in_2,in_3);endmodule;",
            })
    {
        parse_verilog_file(input);
    }
    return 0;
}

netlist/netlistlexer.h:

#ifndef NETLISTLEXER_H
#define NETLISTLEXER_H
#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix_core.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>
#include <boost/spirit/include/phoenix_fusion.hpp>
#include <boost/spirit/include/phoenix_stl.hpp>
#include <boost/spirit/include/phoenix_object.hpp>
#include <boost/fusion/include/adapt_struct.hpp>
#include <boost/variant/recursive_variant.hpp>
#include <boost/foreach.hpp>

#include <iostream>
#include <fstream>
#include <string>
#include <vector>
namespace fusion = boost::fusion;
namespace phoenix = boost::phoenix;
namespace qi = boost::spirit::qi;
namespace ascii = boost::spirit::ascii;
typedef std::vector<std::string> stringvec;
namespace verilog {
    using namespace boost::spirit;
    using boost::phoenix::val;
    using boost::spirit::ascii::char_;
    using boost::spirit::ascii::string;

    ///////////////////////////////////////////////////////////////////////////////
    //  Token definition
    ///////////////////////////////////////////////////////////////////////////////
    template <typename Lexer>
    struct verilog_tokens : lex::lexer<Lexer>
    {
        verilog_tokens()
        {
            // define the tokens to match
            identifier = "[a-zA-Z_][a-zA-Z0-9_]*";
            logic_op = "[\\&\\|]";
            constant = "[0-9]+";
            module_ = "module";
            assign_ = "assign";
            endmodule_ = "endmodule";
            wire_ = "wire";
            input_ = "input";
            output_ = "output";
            inout_ = "inout";
            reg_ = "reg";
            begin_ = "begin";
            end_ = "end";
            always_ = "always";
            if_ = "if";
            else_ = "else";
            parameter_ = "parameter";

            // associate the tokens and the token set with the lexer
            this->self = lex::token_def<>('(') | ')' | '{' | '}' | '=' | '[' | ']' | ';' | constant | logic_op;
            this->self += if_ | else_ | begin_ | end_ | always_ | reg_;
            this->self += module_ | endmodule_ | assign_ | wire_ | input_ | output_ | inout_;
            this->self += parameter_;
            this->self += identifier;

            // define the whitespace to ignore (spaces, tabs, newlines and C-style
            // comments)
            this->self("WS")
                =   lex::token_def<>("[ \\t\\n]+")
                |   "\\/\\*[^*]*\\*+([^/*][^*]*\\*+)*\\/"
                |   "\\/\\/[^\\r\\n\\f]*"
                |   "\\(\\*[^*]*\\*\\)"
                ;
        }

        // these tokens have no attribute
        lex::token_def<lex::omit> if_, else_, begin_, end_, endmodule_;

        // these tokens expose the iterator_range of the matched input sequence
        lex::token_def<> always_, reg_;
        lex::token_def<> module_, assign_, wire_, input_, output_, inout_;
        lex::token_def<> parameter_;

        // The following two tokens have an associated attribute type, 'identifier'
        // carries a string (the identifier name) and 'constant' carries the
        // matched integer value.
        //
        // Note: any token attribute type explicitly specified in a token_def<>
        //       declaration needs to be listed during token type definition as
        //       well (see the typedef for the token_type below).
        //
        // The conversion of the matched input to an instance of this type occurs
        // once (on first access), which makes token attributes as efficient as
        // possible. Moreover, token instances are constructed once by the lexer
        // library. From this point on tokens are passed by reference only,
        // avoiding them being copied around.
        lex::token_def<std::string> identifier;
        lex::token_def<unsigned int> constant;
        lex::token_def<std::string> logic_op;
    };
} // end verilog namespace
#endif // NETLISTLEXER_H

Solution

  • Okay, I had to cut through the fog of Spirit Lex¹ and some quirks indicating that you might not be working with a standards-conforming compiler².

    When I did, I noticed that the actual grammar doesn't use attribute propagation, relying instead on ad-hoc semantic actions to extract some information³.

    I'm already on record that I think Spirit shines for quick prototyping when you find the sweet-spot. Manual AST building based on semantic actions is not where that spot lies IMO.

    As a final subtle clue, I noticed that you "uselessly" include recursive_variant.hpp - which made me think you were actually hoping to use automatic attribute propagation with a recursive AST?


    First Ideas

    Let's use the module_stmt as an example. Instead of "arbitrarily side-effecting" into the module_name and module_inputs parser member variables, let's use an AST type:

    namespace AST {
        using identifiers = stringvec;
    
        struct module {
            std::string name;
            identifiers inputs;
        };
    }
    

    Adapt it for automatic propagation:

    BOOST_FUSION_ADAPT_STRUCT(AST::module, name, inputs)
    

    And rely on that instead:

    module_input_list = tok.identifier % ',';
    module_stmt       
         = tok.module_ >> tok.identifier 
        >> '(' >> -module_input_list >> ')' >> ';'
        >> tok.endmodule_ >> (';' | qi::eoi)
        ;
    

    Note: I had to fix the module_ token definition to use lex::omit, so that it no longer contributes an attribute.
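
    A sketch of the changed declarations in netlistlexer.h (only module_ moves to the attribute-less group, so it no longer injects an iterator_range into module_stmt's synthesized sequence):

    // these tokens have no attribute (module_ moved here)
    lex::token_def<lex::omit> if_, else_, begin_, end_, endmodule_, module_;

    // these tokens still expose the iterator_range of the matched input
    lex::token_def<> always_, reg_;
    lex::token_def<> assign_, wire_, input_, output_, inout_;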

    Note how I included endmodule_ in the rule because that's the natural thing to do. Any nested (recursive) rules (like nested statements) can just naturally go there and synthesize into members of AST::module.

    The rule declarations can be:

    qi::rule<Iterator, AST::module(),      Skipper> module_stmt;
    qi::rule<Iterator, AST::identifiers(), Skipper> module_input_list;
    

    Tying it together

    Of course, now the top-level rules have no attributes declared, so the AST::module instance that is magically synthesized just vanishes. That's unfortunate, but very easy to fix. Extending our AST types:

    namespace AST {
        using identifiers = stringvec;
    
        struct module {
            std::string name;
            identifiers inputs;
        };
    
        using statement = boost::make_recursive_variant<
            module // module_stmt
        >::type;
    
        using statements = std::vector<statement>;
    
        struct program {
            statements body;
        };
    }
    

    This rather simplistic take on a Verilog program will do. We extend the rules:

    qi::rule<Iterator, AST::program(),     Skipper> program;
    qi::rule<Iterator, AST::statements(),  Skipper> statements;
    qi::rule<Iterator, AST::statement(),   Skipper> statement;
    qi::rule<Iterator, AST::module(),      Skipper> module_stmt;
    qi::rule<Iterator, AST::identifiers(), Skipper> module_input_list;
    

    You will note the pattern of matching the rules to their corresponding AST nodes. The rules themselves don't change:

    program            = statements;
    statements         = +statement;
    statement          = module_stmt;
    
    module_input_list = tok.identifier % ',';
    module_stmt       
         = tok.module_ >> tok.identifier 
        >> '(' >> -module_input_list >> ')' >> ';'
        >> tok.endmodule_ >> (';' | qi::eoi)
        ;
    

    Note: I introduced the statements rule for consistency; it also sidesteps a pitfall with propagating into single-element adapted fusion sequences⁴.
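
    One change that is implied but easy to miss: for the synthesized AST::program to reach the caller, the grammar itself has to declare that attribute as well, not just the rules. A sketch of the grammar signature (assuming the skipper is still the qi::in_state_skipper<Lexer> from the original code):

    template <typename Iterator, typename Lexer>
    struct verilog_grammar
    : qi::grammar<Iterator, AST::program(), qi::in_state_skipper<Lexer> >
    {
        template <typename TokenDef>
        verilog_grammar(TokenDef const& tok)
        : verilog_grammar::base_type(program)
        {
            // rule definitions exactly as shown above
        }

        // rule members as listed above
    };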

    Now we can pass an AST::program attribute to the parser invocation:

    AST::program program;
    if (qi::phrase_parse(iter, end, calc, qi::in_state("WS")[tokens.self], program)) {
        for (auto& stmt : program.body) {
            if (auto* module = boost::get<AST::module>(&stmt)) {
                std::cout << "module name: " << module->name << "\n";
                for (std::string const& i : module->inputs) {
                    std::cout << "    module input: " << i << "\n";
                }
            }
        }
    }
    

    Which will print the same output as expected:

    Live On Wandbox

    -------------------------
    module simple_in_n_out();endmodule;
    Parsing succeeded
    module name: simple_in_n_out
    -------------------------
    -------------------------
    module simple_in_n_out(in_1);endmodule;
    Parsing succeeded
    module name: simple_in_n_out
        module input: in_1
    -------------------------
    -------------------------
    module simple_in_n_out(in_1,in_2,in_3);endmodule;
    Parsing failed
    -------------------------
    

    Debugging the failure

    Uncommenting #define BOOST_SPIRIT_DEBUG shows what the problem is:

    Live On Wandbox

    <program>
      <try>[module]</try>
      <statements>
        <try>[module]</try>
        <statement>
          <try>[module]</try>
          <module_stmt>
            <try>[module]</try>
            <module_input_list>
              <try>[in_1]</try>
              <success></success>
              <attributes>[[[i, n, _, 1]]]</attributes>
            </module_input_list>
            <fail/>
          </module_stmt>
          <fail/>
        </statement>
        <fail/>
      </statements>
      <fail/>
    </program>
    

    The problem isn't with any of the rules! It's the ',' that isn't matching. A quick glance at the tokens tells us why: there's no token that matches the comma... Adding it in a hurry:
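
    A minimal sketch of that change in netlistlexer.h, assuming the comma simply joins the other single-character tokens:

    // associate the tokens and the token set with the lexer (',' added)
    this->self = lex::token_def<>('(') | ')' | '{' | '}' | '=' | '[' | ']' | ';' | ',' | constant | logic_op;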

    Live On Wandbox

    -------------------------
    module simple_in_n_out();endmodule;
    Parsing succeeded
    module name: simple_in_n_out
    -------------------------
    -------------------------
    module simple_in_n_out(in_1);endmodule;
    Parsing succeeded
    module name: simple_in_n_out
        module input: in_1
    -------------------------
    -------------------------
    module simple_in_n_out(in_1,in_2,in_3);endmodule;
    Parsing succeeded
    module name: simple_in_n_out
        module input: in_1
        module input: in_2
        module input: in_3
    -------------------------
    

    BONUS

    However, that "problem" kinda highlights another cost factor due to the lexer (notice the other issue with the module_ token that I papered over earlier). So here's the whole thing without the Lex overhead, without the Phoenix overhead, in a fraction of the code, and with full AST propagation:

    Live On Wandbox

    // #define BOOST_SPIRIT_DEBUG
    #include <boost/fusion/adapted.hpp>
    #include <boost/spirit/include/qi.hpp>
    #include <iomanip>   // std::quoted
    #include <iostream>  // std::cout
    #include <sstream>   // std::ostringstream
    #include <stdexcept> // std::runtime_error
    #include <string>
    #include <vector>
    namespace qi = boost::spirit::qi;
    
    namespace AST {
        using identifier = std::string;
        using identifiers = std::vector<identifier>;
    
        struct module {
            identifier name;
            identifiers inputs;
        };
    
        using statement = boost::variant<module>;
        using statements = std::vector<statement>;
    
        struct program {
            statements body;
        };
    }
    
    BOOST_FUSION_ADAPT_STRUCT(AST::module, name, inputs)
    BOOST_FUSION_ADAPT_STRUCT(AST::program, body)
    
    namespace verilog {
        template <typename Iterator> struct verilog_grammar : qi::grammar<Iterator, AST::program()> {
    
            verilog_grammar() : verilog_grammar::base_type(start) {
                auto kw = [](auto p) { return qi::copy(qi::lexeme[qi::no_case[p] >> !(qi::alnum|'_') ]); };
    
                start      = qi::skip(skipper.alias()) [ program ];
                program    = statements > qi::eoi;
                statements = -statement % ';';
                statement  = module_stmt.alias();
    
                module_input_list = identifier % ',';
                module_stmt       
                     = kw("module") >> identifier 
                    >> '(' >> -module_input_list >> ')' >> ';'
                    >> kw("endmodule")
                    ;
    
                // lexemes
                identifier = qi::char_("a-zA-Z_") >> *qi::char_("a-zA-Z0-9_");
    
                skipper = qi::char_(" \t\r\n") // added \r for consistency
                    | "//" >> *~qi::char_("\r\n\f") 
                    | "/*" >> *(qi::char_ - "*/") >> "*/"
                    | "(*" >> *(qi::char_ - "*)") >> "*)"
                    ;
    
                BOOST_SPIRIT_DEBUG_NODES((program)(statements)(statement)(module_stmt)(module_input_list)(identifier));
            }
    
          private:
            using Skipper = qi::rule<Iterator>;
    
            qi::rule<Iterator, AST::program()> start;
            Skipper skipper;
    
            qi::rule<Iterator, AST::program(),     Skipper> program;
            qi::rule<Iterator, AST::statements(),  Skipper> statements;
            qi::rule<Iterator, AST::statement(),   Skipper> statement;
            qi::rule<Iterator, AST::module(),      Skipper> module_stmt;
            qi::rule<Iterator, AST::identifiers(), Skipper> module_input_list;
    
            // lexemes (formerly "tokens")
            qi::rule<Iterator, AST::identifier()> identifier;
        };
    } // end verilog namespace
    
    AST::program parse_verilog_file(std::string const& str) {
        typedef std::string::const_iterator iterator;
        static const verilog::verilog_grammar<iterator> grammar; // Our parser, now stateless
    
        try {
            AST::program program;
            parse(str.begin(), str.end(), grammar, program);
            return program;
        } catch(qi::expectation_failure<iterator> const& ef) {
            std::ostringstream msg;
            msg << "Parsing failed: expected " << ef.what_ << " at " << std::quoted(std::string(ef.first, ef.last));
            throw std::runtime_error(msg.str());
        }
    }
    
    int main() {
        for (const std::string input : std::vector<std::string>{
                 "module simple_in_n_out();endmodule;",
                 "module simple_in_n_out(in_1);endmodule;",
                 "module simple_in_n_out(in_1,in_2,in_3);endmodule;",
                 "module a();endmodule",
                 "module a();endmodule;oops",
             })
        try {
            std::cout << "-------------------------\n";
            std::cout << std::quoted(input) << "\n";
    
            for (auto const& stmt : parse_verilog_file(input).body) {
                if (auto* module = boost::get<AST::module>(&stmt)) {
                    std::cout << "module name: " << module->name << "\n";
                    for (std::string const& i : module->inputs) {
                        std::cout << "    module input: " << i << "\n";
                    }
                }
            }
        } catch(std::exception const& e) {
            std::cout << e.what() << '\n';
        }
    }
    

    Printing

    -------------------------
    "module simple_in_n_out();endmodule;"
    module name: simple_in_n_out
    -------------------------
    "module simple_in_n_out(in_1);endmodule;"
    module name: simple_in_n_out
        module input: in_1
    -------------------------
    "module simple_in_n_out(in_1,in_2,in_3);endmodule;"
    module name: simple_in_n_out
        module input: in_1
        module input: in_2
        module input: in_3
    -------------------------
    "module a();endmodule"
    module name: a
    -------------------------
    "module a();endmodule;oops"
    Parsing failed: expected <eoi> at "oops"
    

    Notable improvements:

    • Correct parsing of "keyword boundaries" (that's what the kw() helper is about). This means that if you have an identifier that begins with something that could be a keyword, it will not be falsely tokenized as that keyword (something that would happen with the original Lex-based approach); see the stand-alone sketch after this list
    • Keywords are trivially case-insensitive (qi::no_case[]) - just for the sake of demo
    • The skipper is so much simpler to specify, and readable at the same time
    • Something that I already did in the Lex-based versions in this answer: the skipper is encapsulated in the grammar now. I'm of the opinion that a skipper should only be user-suppliable if the user may actually need to change it. In 99% of the cases the skipper is tightly coupled to the parser and using the wrong one would break the grammar regardless.

      As a bonus, invocation becomes much cleaner:

        AST::program program;
        parse(str.begin(), str.end(), grammar, program);
        return program;
    
    • Tying into that, notice how I simplified the parse_verilog_file function to ... be a function (returning the result), separating producing the result from handling it
        AST::program parse_verilog_file(std::string const& str) {
            typedef std::string::const_iterator iterator;
            static const verilog::verilog_grammar<iterator> grammar; // Our parser, now stateless
    
            try {
                AST::program program;
                parse(str.begin(), str.end(), grammar, program);
                return program;
            } catch(qi::expectation_failure<iterator> const& ef) {
                std::ostringstream msg;
                msg << "Parsing failed: expected " << ef.what_ << " at " << std::quoted(std::string(ef.first, ef.last));
                throw std::runtime_error(msg.str());
            }
        }
    
    • That in turn shows simplified error handling by just catching the exception

    • Which in turn I use to replace the iter != end check with just another expectation point:

        program    = statements > qi::eoi;
    

    This, in combination with

        statements = -statement % ';';
    

    Makes it so that ';' is required between statements, but not at the end of the program (which I guess is what you wanted to convey with the old endmodule rule)

    Notice as well that -statement % ';' makes it so that empty statements are acceptable. If that's not what you wanted, drop the -.

    Note that the added test cases test and demonstrate the error detection/reporting for this logic ("module a();endmodule;oops" results in Parsing failed: expected <eoi> at "oops")

    • Any "tokens" like "identifier" are now "lexeme" rules, in that they do not obey the skipper⁵. The debug support now seamlessly includes those tokens the way you'd expect: Live On Wandbox

      <module_input_list>
        <try>in_1,in_2,in_3);endm</try>
        <identifier>
          <try>in_1,in_2,in_3);endm</try>
          <success>,in_2,in_3);endmodul</success>
          <attributes>[[i, n, _, 1]]</attributes>
        </identifier>
        <identifier>
          <try>in_2,in_3);endmodule</try>
          <success>,in_3);endmodule;oop</success>
          <attributes>[[i, n, _, 2]]</attributes>
        </identifier>
        <identifier>
          <try>in_3);endmodule;oops</try>
          <success>);endmodule;oops</success>
          <attributes>[[i, n, _, 3]]</attributes>
        </identifier>
        <success>);endmodule;oops</success>
        <attributes>[[[i, n, _, 1], [i, n, _, 2], [i, n, _, 3]]]</attributes>
      </module_input_list>
      
    • Oh, the code is significantly shorter while doing more: down from 211 to 112 lines of code (-47%)

    • It compiles significantly faster (12.1s down from 19.7s on my system)
    • Oh, given the current features, it can be further simplified: this clocks in at 90 LoC. However, I would instead encourage improving the features of the grammar; look e.g. here
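
    As a stand-alone illustration of the keyword-boundary point from the list above, here is a hypothetical minimal test (not part of the Wandbox code) reusing the same kw() helper: the keyword only matches when it is not followed by another identifier character, so an identifier like modulex is never mistaken for the keyword module.

    #include <boost/spirit/include/qi.hpp>
    #include <cassert>
    #include <string>
    namespace qi = boost::spirit::qi;

    int main() {
        // same helper as in the grammar: a keyword must not be followed by an identifier character
        auto kw = [](auto p) { return qi::copy(qi::lexeme[qi::no_case[p] >> !(qi::alnum | '_')]); };

        std::string a = "module", b = "modulex";
        assert( qi::parse(a.begin(), a.end(), kw("module")));  // the keyword itself matches
        assert(!qi::parse(b.begin(), b.end(), kw("module")));  // a longer identifier does not
    }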

    ¹ Anecdotally: "nobody uses that anymore". I'm not saying that because I don't know it (see), and I'm not alone. From this 2017 answer:

    using Lex makes most of the sweet-spot disappear since all "highlevel" parsers (like real_parser, [u]int_parser) are out the window. The Spirit devs are on record they prefer not to use Lex. Moreover, Spirit X3 doesn't have Lex support anymore.

    ² My guess: MSVC, not fully up-to-date? The main culprit being ambiguous names because you use using namespace.

    ³ Boost Spirit: "Semantic actions are evil"?

    ⁴ see many answers on SO: https://stackoverflow.com/search?q=user%3A85371+spirit+single-element

    ⁵ see perhaps Boost spirit skipper issues for my go-to description of how skippers, rule declarations and lexemes interact