Search code examples
c++boostboost-spirit-qi

recursive BNF rule using boost spirit


I'm trying to write a parser for the following BNF rules using boost spirit (Boost v1.64)
The rules are:

<numeric-literal>::= integer  
<type-name> ::= "in" | "out" | "in_out"  
<array-type-spec> ::= <type-spec> "[" [<numeric-literal>] "]"  
<tuple-type-spec> ::= "(" <type-spec> ("," <type-spec>)+ ")"
<type-spec> ::= <type-name> | <array-type-spec> | <tuple-type-spec>  

Below is my attempt, using boost::make_recursive_variant
It seems to work ok on the string in
But it fails on in[2].
Where is my mistake?
What would be an elegant solution?

namespace Ast {
enum class TypeName { IN, OUT, INOUT};
using NumericLiteral = int;
    using TypeSpec = boost::make_recursive_variant
    <
    TypeName,
    std::pair<boost::recursive_variant_, NumericLiteral>,
    std::vector < boost::recursive_variant_ >
    >::type;
}
//grammar:
namespace myGrammar {
namespace qi = boost::spirit::qi;

template <typename Iterator = char const*,typename Signature = Ast::TypeSpec()>
struct myRules : qi::grammar < Iterator, Signature> {

    myRules() : myRules::base_type(start) {
        fillSymbols();
        rNumericLiteral = qi::int_;
        rTypeName = sTypeName;
        rTypeSpec = rTypeName | (rTypeSpec >> '[' >> rNumericLiteral >> ']') | ('(' >> qi::repeat(2, qi::inf)[(rTypeSpec % ',')] >> ')');

        start = qi::skip(qi::space)[rTypeSpec];
    }

private:
    using Skipper = qi::space_type;
    qi::rule<Iterator,  Ast::TypeSpec()> start;
    qi::rule<Iterator, Ast::NumericLiteral(), Skipper> rNumericLiteral;

    qi::rule<Iterator, Ast::TypeName(), Skipper> rTypeName;
    qi::rule<Iterator, Ast::TypeSpec(), Skipper> rTypeSpec;


    //symbols
    qi::symbols<char, Ast::TypeName>sTypeName;
    void fillSymbols()
    {
        using namespace Ast;
        sTypeName.add
            ("in", TypeName::IN)
            ("out", TypeName::OUT)
            ("in_out", TypeName::INOUT)
    }

};
}

Solution

  • There's a problem translating this grammar 1:1 to a PEG grammar since left-recursion leads to infinite recursion.

    You can still trivially rearrange the rules so left-recursion doesn't occur, but you will have more trouble synthesizing the AST you want.

    Here's a halfway station that has half-decent test results:

    Live On Coliru

    //#define BOOST_SPIRIT_DEBUG
    #include <boost/spirit/include/qi.hpp>
    #include <boost/fusion/adapted/std_pair.hpp>
    
    /*
        <numeric-literal> ::= integer
        <type-name>       ::= "in" | "out" | "in_out"
        <array-type-spec> ::= <type-spec> "[" [<numeric-literal>] "]"
        <tuple-type-spec> ::= "(" <type-spec> ("," <type-spec>)+ ")"
        <type-spec>       ::= <type-name> | <array-type-spec> | <tuple-type-spec>
    */
    
    namespace Ast {
        enum class TypeName { IN, OUT, INOUT };
    
        static inline std::ostream& operator<<(std::ostream& os, TypeName tn) {
            switch(tn) {
                case TypeName::IN:    return os << "IN";
                case TypeName::OUT:   return os << "OUT";
                case TypeName::INOUT: return os << "INOUT";
            }
            return os << "?";
        }
    
        using NumericLiteral = int;
    
        using TypeSpec = boost::make_recursive_variant<
            TypeName,
            std::pair<boost::recursive_variant_, NumericLiteral>,
            std::vector<boost::recursive_variant_>
        >::type;
    
        using ArraySpec = std::pair<TypeSpec, NumericLiteral>;
        using TupleSpec = std::vector<TypeSpec>;
    }
    
    // grammar:
    namespace myGrammar {
        namespace qi = boost::spirit::qi;
    
        template <typename Iterator = char const *, typename Signature = Ast::TypeSpec()>
            struct myRules : qi::grammar<Iterator, Signature> {
    
                myRules() : myRules::base_type(start) {
                    rNumericLiteral = qi::int_;
                    rTypeName       = sTypeName >> !qi::alpha;
                    rTupleSpec      = '(' >> rTypeSpec >> +(',' >> rTypeSpec) >> ')'; 
                    rScalarSpec     = rTypeName | rTupleSpec;
                    rArraySpec      = rScalarSpec >> '[' >> rNumericLiteral >> ']';
                    rTypeSpec       = rArraySpec | rScalarSpec;
    
                    start = qi::skip(qi::space)[rTypeSpec >> qi::eoi];
    
                    BOOST_SPIRIT_DEBUG_NODES((start)(rTypeSpec)(rTypeName)(rArraySpec)(rScalarSpec)(rTypeSpec)(rNumericLiteral))
                }
    
              private:
                using Skipper = qi::space_type;
                qi::rule<Iterator, Ast::TypeSpec()> start;
                qi::rule<Iterator, Ast::NumericLiteral(), Skipper> rNumericLiteral;
                qi::rule<Iterator, Ast::ArraySpec(), Skipper> rArraySpec;
                qi::rule<Iterator, Ast::TypeSpec(), Skipper> rTypeSpec, rScalarSpec;
                qi::rule<Iterator, Ast::TupleSpec(), Skipper> rTupleSpec;
                // implicit lexeme
                qi::rule<Iterator, Ast::TypeName()> rTypeName;
    
                // symbols
                struct TypeName_r : qi::symbols<char, Ast::TypeName> { 
                    TypeName_r() {
                        using Ast::TypeName;
                        add ("in", TypeName::IN)
                            ("out", TypeName::OUT)
                            ("in_out", TypeName::INOUT);
                    }
                } sTypeName;
            };
    }
    
    static inline std::ostream& operator<<(std::ostream& os, Ast::TypeSpec tn) {
        struct {
            std::ostream& _os;
    
            void operator()(Ast::TypeSpec const& ts) const {
                apply_visitor(*this, ts);
            }
            void operator()(Ast::TypeName tn) const { std::cout << tn; }
            void operator()(Ast::TupleSpec const& tss) const { 
                std::cout << "(";
                for (auto const& ts: tss) {
                    (*this)(ts); 
                    std::cout << ", ";
                }
                std::cout << ")";
            }
            void operator()(Ast::ArraySpec const& as) const { 
                (*this)(as.first);
                std::cout << '[' << as.second << ']';
            }
        } const dumper{os};
    
        dumper(tn);
        return os;
    }
    
    int main() {
        using It = std::string::const_iterator;
        myGrammar::myRules<It> const parser;
    
        std::string const test_ok[] = {
            "in",
            "out",
            "in_out",
            "(in, out)",
            "(out, in)",
            "(in, in, in, out, in_out)",
            "in[13]",
            "in[0]",
            "in[-2]",
            "in[1][2][3]",
            "in[3][3][3]",
            "(in[3][3][3], out, in_out[0])",
            "(in[3][3][3], out, in_out[0])",
            "(in, out)[13]",
            "(in, out)[13][0]",
        };
    
        std::string const test_fail[] = {
            "",
            "i n",
            "inout",
            "()",
            "(in)",
            "(out)",
            "(in_out)",
            "IN",
        };
    
        auto expect = [&](std::string const& sample, bool expected) {
            It f = sample.begin(), l = sample.end(); 
    
            Ast::TypeSpec spec;
            bool ok = parse(f, l, parser, spec);
    
            std::cout << "Test passed:" << std::boolalpha << (expected == ok) << "\n";
    
            if (expected || (expected != ok)) {
                if (ok) {
                    std::cout << "Parsed: " << spec << "\n";
                } else {
                    std::cout << "Parse failed\n";
                }
            }
    
            if (f!=l) {
                std::cout << "Remaining unparsed: '" << std::string(f,l) << "'\n";
            }
        };
    
        for (std::string const sample : test_ok)   expect(sample, true); 
        for (std::string const sample : test_fail) expect(sample, false); 
    }
    

    Prints

    Test passed:true
    Parsed: IN
    Test passed:true
    Parsed: OUT
    Test passed:true
    Parsed: INOUT
    Test passed:true
    Parsed: (IN, OUT, )
    Test passed:true
    Parsed: (OUT, IN, )
    Test passed:true
    Parsed: (IN, IN, IN, OUT, INOUT, )
    Test passed:true
    Parsed: IN[13]
    Test passed:true
    Parsed: IN[0]
    Test passed:true
    Parsed: IN[-2]
    Test passed:false
    Parse failed
    Remaining unparsed: 'in[1][2][3]'
    Test passed:false
    Parse failed
    Remaining unparsed: 'in[3][3][3]'
    Test passed:false
    Parse failed
    Remaining unparsed: '(in[3][3][3], out, in_out[0])'
    Test passed:false
    Parse failed
    Remaining unparsed: '(in[3][3][3], out, in_out[0])'
    Test passed:true
    Parsed: (IN, OUT, )[13]
    Test passed:false
    Parse failed
    Remaining unparsed: '(in, out)[13][0]'
    Test passed:true
    Test passed:true
    Remaining unparsed: 'i n'
    Test passed:true
    Remaining unparsed: 'inout'
    Test passed:true
    Remaining unparsed: '()'
    Test passed:true
    Remaining unparsed: '(in)'
    Test passed:true
    Remaining unparsed: '(out)'
    Test passed:true
    Remaining unparsed: '(in_out)'
    Test passed:true
    Remaining unparsed: 'IN'
    

    As you can see most things get parsed correctly, except for chained array dimensions like in[1][2]. The trouble is that we resolved ambiguity by inducing a "precedence" in the rules:

    rScalarSpec     = rTypeName | rTupleSpec;
    rArraySpec      = rScalarSpec >> '[' >> rNumericLiteral >> ']';
    rTypeSpec       = rArraySpec | rScalarSpec;
    

    This means we always try expecting an array dimension first, and only fallback to scalar type-spec if we failed to find one. This is because any array-spec would always be matched as a scalarspec first making it impossible to parse the array-dimension part.

    To fix the multi-dimensional case, you could try asserting that [ doesn't follow the array-spec:

        rArraySpec      = rScalarSpec >> '[' >> rNumericLiteral >> ']' >> !qi::lit('[')
                        | rArraySpec  >> '[' >> rNumericLiteral >> ']';
    

    But -- BOOM -- we're back at left-recursion again (in case we enter the second branch, e.g. in[1][).

    Back to the drawing board.

    Two thoughts cross my mind.

    1. I'd say it would be very beneficial to remove the distinction between scalar/array spec in the AST. If a scalar were to be treated as a zero-rank array that would just mean we could always parse an optional dimension into the same resulting AST type.

    2. The other thought more or less continues down the road shown above and would require backtracking all the way down if a presumed scalar spec was followed by a '[' character. This would lead to bad worst case behaviour in cases like (very long spec)[1][1][1][1][1][1][1][1][1][1].

    Let me implement the first idea outlined after a coffee break :)

    Reworked AST

    Here the TypeSpec always carries a (possibly empty) collection of dimensions:

    namespace Ast {
        enum class TypeName { IN, OUT, INOUT };
    
        static inline std::ostream& operator<<(std::ostream& os, TypeName tn) {
            switch(tn) {
                case TypeName::IN:    return os << "IN";
                case TypeName::OUT:   return os << "OUT";
                case TypeName::INOUT: return os << "INOUT";
            }
            return os << "?";
        }
    
        struct TypeSpec;
    
        using ScalarSpec = boost::make_recursive_variant<
            TypeName,
            std::vector<TypeSpec>
        >::type;
    
        struct TypeSpec {
            ScalarSpec            spec;
            std::vector<unsigned> dim;
        };
    
        using TupleSpec = std::vector<TypeSpec>;
    }
    

    Note that we also improved by making dimensions unsigned. The grammar will check that it's not 0 for completeness. A number of "positive" test cases have moved to the "expected-to-fail" cases for this reason.

    Now the grammar is a straightforward mimic of that:

    rRank      %= qi::uint_ [qi::_pass = (qi::_1 > 0)];
    rTypeName   = sTypeName;
    rTupleSpec  = '(' >> rTypeSpec >> +(',' >> rTypeSpec) >> ')'; 
    rScalarSpec = rTypeName | rTupleSpec;
    rTypeSpec   = rScalarSpec >> *('[' >> rRank >> ']');
    

    Note the semantic action using Phoenix to assert that the array dimension cannot be 0

    And here's the live demo showing all testcases passing:

    FULL DEMO

    Live On Coliru

    //#define BOOST_SPIRIT_DEBUG
    #include <boost/spirit/include/qi.hpp>
    #include <boost/spirit/include/phoenix.hpp>
    #include <boost/fusion/adapted.hpp>
    
    /*
        <numeric-literal> ::= integer
        <type-name>       ::= "in" | "out" | "in_out"
        <array-type-spec> ::= <type-spec> "[" [<numeric-literal>] "]"
        <tuple-type-spec> ::= "(" <type-spec> ("," <type-spec>)+ ")"
        <type-spec>       ::= <type-name> | <array-type-spec> | <tuple-type-spec>
    */
    
    namespace Ast {
        enum class TypeName { IN, OUT, INOUT };
    
        static inline std::ostream& operator<<(std::ostream& os, TypeName tn) {
            switch(tn) {
                case TypeName::IN:    return os << "IN";
                case TypeName::OUT:   return os << "OUT";
                case TypeName::INOUT: return os << "INOUT";
            }
            return os << "?";
        }
    
        struct TypeSpec;
    
        using ScalarSpec = boost::make_recursive_variant<
            TypeName,
            std::vector<TypeSpec>
        >::type;
    
        struct TypeSpec {
            ScalarSpec            spec;
            std::vector<unsigned> dim;
        };
    
        using TupleSpec = std::vector<TypeSpec>;
    }
    
    BOOST_FUSION_ADAPT_STRUCT(Ast::TypeSpec, spec, dim)
    
    // grammar:
    namespace myGrammar {
        namespace qi = boost::spirit::qi;
    
        template <typename Iterator = char const *, typename Signature = Ast::TypeSpec()>
            struct myRules : qi::grammar<Iterator, Signature> {
    
                myRules() : myRules::base_type(start) {
                    rRank      %= qi::uint_ [qi::_pass = (qi::_1 > 0)];
                    rTypeName   = sTypeName;
                    rTupleSpec  = '(' >> rTypeSpec >> +(',' >> rTypeSpec) >> ')'; 
                    rScalarSpec = rTypeName | rTupleSpec;
                    rTypeSpec   = rScalarSpec >> *('[' >> rRank >> ']');
    
                    start = qi::skip(qi::space)[rTypeSpec >> qi::eoi];
    
                    BOOST_SPIRIT_DEBUG_NODES((start)(rTypeSpec)(rTypeName)(rScalarSpec)(rTypeSpec)(rRank))
                }
    
              private:
                using Skipper = qi::space_type;
                qi::rule<Iterator, Ast::TypeSpec()> start;
                qi::rule<Iterator, Ast::ScalarSpec(), Skipper> rScalarSpec;
                qi::rule<Iterator, Ast::TypeSpec(),   Skipper> rTypeSpec;
                qi::rule<Iterator, Ast::TupleSpec(),  Skipper> rTupleSpec;
                // implicit lexeme
                qi::rule<Iterator, Ast::TypeName()> rTypeName;
                qi::rule<Iterator, unsigned()>      rRank;
    
                // symbols
                struct TypeName_r : qi::symbols<char, Ast::TypeName> { 
                    TypeName_r() {
                        using Ast::TypeName;
                        add ("in", TypeName::IN)
                            ("out", TypeName::OUT)
                            ("in_out", TypeName::INOUT);
                    }
                } sTypeName;
            };
    }
    
    static inline std::ostream& operator<<(std::ostream& os, Ast::TypeSpec tn) {
        struct {
            std::ostream& _os;
    
            void operator()(Ast::ScalarSpec const& ts) const {
                apply_visitor(*this, ts);
            }
            void operator()(Ast::TypeName tn) const { std::cout << tn; }
            void operator()(Ast::TupleSpec const& tss) const { 
                std::cout << "(";
                for (auto const& ts: tss) {
                    (*this)(ts); 
                    std::cout << ", ";
                }
                std::cout << ")";
            }
            void operator()(Ast::TypeSpec const& as) const { 
                (*this)(as.spec);
                for (auto rank : as.dim)
                    std::cout << '[' << rank << ']';
            }
        } const dumper{os};
    
        dumper(tn);
        return os;
    }
    
    int main() {
        using It = std::string::const_iterator;
        myGrammar::myRules<It> const parser;
    
        std::string const test_ok[] = {
            "in",
            "out",
            "in_out",
            "(in, out)",
            "(out, in)",
            "(in, in, in, out, in_out)",
            "in[13]",
            "in[1][2][3]",
            "in[3][3][3]",
            "(in[3][3][3], out, in_out[1])",
            "(in[3][3][3], out, in_out[1])",
            "(in, out)[13]",
            "(in, out)[13][14]",
        };
    
        std::string const test_fail[] = {
            "",
            "i n",
            "inout",
            "()",
            "(in)",
            "(out)",
            "(in_out)",
            "IN",
            "in[0]",
            "in[-2]",
            "(in[3][3][3], out, in_out[0])",
            "(in[3][3][3], out, in_out[0])",
        };
    
        auto expect = [&](std::string const& sample, bool expected) {
            It f = sample.begin(), l = sample.end(); 
    
            Ast::TypeSpec spec;
            bool ok = parse(f, l, parser, spec);
    
            std::cout << "Test passed:" << std::boolalpha << (expected == ok) << "\n";
    
            if (expected || (expected != ok)) {
                if (ok) {
                    std::cout << "Parsed: " << spec << "\n";
                } else {
                    std::cout << "Parse failed\n";
                }
            }
    
            if (f!=l) {
                std::cout << "Remaining unparsed: '" << std::string(f,l) << "'\n";
            }
        };
    
        for (std::string const sample : test_ok)   expect(sample, true); 
        for (std::string const sample : test_fail) expect(sample, false); 
    }
    

    Prints

    Test passed:true
    Parsed: IN
    Test passed:true
    Parsed: OUT
    Test passed:true
    Parsed: INOUT
    Test passed:true
    Parsed: (IN, OUT, )
    Test passed:true
    Parsed: (OUT, IN, )
    Test passed:true
    Parsed: (IN, IN, IN, OUT, INOUT, )
    Test passed:true
    Parsed: IN[13]
    Test passed:true
    Parsed: IN[1][2][3]
    Test passed:true
    Parsed: IN[3][3][3]
    Test passed:true
    Parsed: (IN[3][3][3], OUT, INOUT[1], )
    Test passed:true
    Parsed: (IN[3][3][3], OUT, INOUT[1], )
    Test passed:true
    Parsed: (IN, OUT, )[13]
    Test passed:true
    Parsed: (IN, OUT, )[13][14]
    Test passed:true
    Test passed:true
    Remaining unparsed: 'i n'
    Test passed:true
    Remaining unparsed: 'inout'
    Test passed:true
    Remaining unparsed: '()'
    Test passed:true
    Remaining unparsed: '(in)'
    Test passed:true
    Remaining unparsed: '(out)'
    Test passed:true
    Remaining unparsed: '(in_out)'
    Test passed:true
    Remaining unparsed: 'IN'
    Test passed:true
    Remaining unparsed: 'in[0]'
    Test passed:true
    Remaining unparsed: 'in[-2]'
    Test passed:true
    Remaining unparsed: '(in[3][3][3], out, in_out[0])'
    Test passed:true
    Remaining unparsed: '(in[3][3][3], out, in_out[0])'