Search code examples
c++parsingboostboost-spirit-qi

Followup: Using boost::spirit::qi to parse numbers with separators


This is a followup question to Using boost::spirit::qi to parse numbers with separators.


Following sehe's very good suggestions, I managed to get number parsing to work. I then attempted to update it to have a secondary parser which handled numbers with an optional sign. This second attempt failed. I suspect I have dome something incorrect with respect to how to handle a sub-grammar. Code is as follows:

#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>

namespace qi = boost::spirit::qi;
namespace ascii = boost::spirit::ascii;
namespace phoenix = boost::phoenix;

template <typename Iterator, typename Num>
struct unsigned_parser : qi::grammar<Iterator, Num()> {
    unsigned_parser() : unsigned_parser::base_type(start) {
        using qi::_val;
        using qi::_1;
        using qi::eps;
        using qi::debug;
        using ascii::char_;

        bin = eps[_val=0] >> *(char_("01")[_val = _val * 2 + dval(_1)] | '_');
        oct = eps[_val=0] >> *(char_("0-7")[_val = _val * 8 + dval(_1)] | '_');
        dec = eps[_val=0]
              >> *(char_("0-9")[_val = _val * 10 + dval(_1)] | '_');
        hex = eps[_val=0]
              >> *(char_("0-9a-fA-F")[_val = _val * 16 + dval(_1)] | '_');
        start = (char_('0') >>
                 ((char_("xXhH") >> hex[_val=_1])
                  | (char_("bByY") >> bin[_val=_1])
                  | (char_("oOqQ") >> oct[_val=_1])
                  | (char_("dDtT") >> dec[_val=_1])))
                | (hex[_val=_1] >> char_("xXhH"))
                | (bin[_val=_1] >> char_("bByY"))
                | (oct[_val=_1] >> char_("oOqQ"))
                | (dec[_val=_1] >> -char_("dDtT"));
        start.name("unum");
        hex.name("hex");
        oct.name("oct");
        dec.name("dec");
        bin.name("bin");

        debug(start);
        debug(hex);
        debug(oct);
        debug(dec);
        debug(bin);
    }
    qi::rule<Iterator, Num()> start;
    qi::rule<Iterator, Num()> hex;
    qi::rule<Iterator, Num()> oct;
    qi::rule<Iterator, Num()> dec;
    qi::rule<Iterator, Num()> bin;
    struct _dval {
        template <typename> struct result { typedef uint8_t type; };
        template <typename T> uint8_t operator()(T ch) const {
            if (ch >= '0' || ch <= '9') {
                return ch - '0';
            }
            ch = std::tolower(ch);
            if (ch >= 'a' || ch <= 'f') {
                return ch - 'a' + 10;
            }
            assert(false);
        }
    };
    boost::phoenix::function<_dval> dval;
};

template <typename Iterator, typename Num>
struct signed_parser : qi::grammar<Iterator, Num()> {
    signed_parser() : signed_parser::base_type(start) {
        using qi::eps;
        using qi::_val;
        using qi::_1;
        using ascii::char_;
        using phoenix::static_cast_;
        unum = unsigned_parser<Iterator, Num>();
        start = (char_('-') >> unum[_val=-_1])
                | (-char_('+') >> unum[_val=_1]);
        unum.name("unum");
        start.name("snum");
        debug(start);
        /* debug(unum); */
    }
    qi::rule<Iterator, Num()> start;
    qi::rule<Iterator, Num()> unum;
};

int main(int argv, const char *argc[]) {
    using phoenix::ref;
    using qi::eoi;
    using qi::_1;

    typedef std::string::const_iterator iter;
    signed_parser<iter, int64_t> sp;
    int64_t val;
    if (argv != 2) {
        std::cerr << "Usage: " << argc[0] << " <input>" << std::endl;
        return 1;
    }
    std::string test(argc[1]);
    iter i = test.begin();
    iter end = test.end();
    bool rv = phrase_parse(i, end, sp[ref(val)=_1] >> eoi, ascii::space);
    if (rv) {
        assert(i == end);
        std::cout << "Succeeded: " << val << std::endl;
        return 0;
    }
    std::cout << "Failed." << std::endl;
    return 1;
}

With the signed_parser, every parse fails. Moreover, if I uncomment the commented-out debug(), the program segfaults.

I feel as if I am close to beginning to understand how to use this, so any help would be appreciated.


Solution

  • Using all those separate rules kills the opportunity for the compiler to optimize the parsing.

    You cannot refer to a temporary grammar/rule. You need to have the grammar instance around:

    template <typename Iterator, typename Num>
    struct signed_parser : qi::grammar<Iterator, Num()> {
        signed_parser() : signed_parser::base_type(snum) {
            using namespace qi;
    
            snum = lit('-') >> unum
                | -lit('+') >> unum
                ;
    
            BOOST_SPIRIT_DEBUG_NODES((snum))
        }
    private:
        qi::rule<Iterator, Num()> snum;
        unsigned_parser<Iterator, Num> unum;
    };
    

    Here's some cleanup for you:

    • swap argc and argv will you :)
    • use BOOST_SPIRIT_DEBUG* macros

      BOOST_SPIRIT_DEBUG_NODES((unum) (hex) (oct) (dec) (bin));
      
    • use bare literals instead if lit() or (worse!) char_()

    • prefer using automatic attribute propagation (Boost Spirit: "Semantic actions are evil"?). E.g. the rules can be much simpler:

          snum = lit('-') >> unum
              | -lit('+') >> unum
              ;
      
    • use %= to preserve automatic propagation in the presence of semantic actions:

          snum %= lit('-') >> unum [ _val = -_1 ]
               | -lit('+') >> unum
               ;
      
    • same thing goes for the phrase_parse call itself: you can pass bound references for the attributes. No need for semantic actions

    • doing tolower(ch) is likely slower (since you know it's ASCII), possibly incorrect (you get sign extension if your compiler has signed char)

    • UPDATE there was a rather gruesome bug in your dval actor. The range checks were wrong! Here's my fixed version:

      struct accum_f {
          template <typename...> struct result { typedef void type; };
          void operator()(char ch, Num& accum, int base) const {
              accum *= base;
      
              if      (ch >= '0' && ch <= '9') accum += ch - '0';
              else if (ch >= 'a' && ch <= 'f') accum += ch - 'a' + 10;
              else if (ch >= 'A' && ch <= 'F') accum += ch - 'A' + 10;
              else assert(false);
          }
      };
      boost::phoenix::function<accum_f> _accum;
      

      See below for consequential changes/simplifications to the semantic action

    • you can use the builting int_parser in the prefix branches; this could be (much) faster

    • caveat: when you write the unum semantic-action-less, it becomes essential that you don't "capture" the '0' with qi::char_ like you did. Otherwise, you'll be wondering why the result of any prefix-formatted number is always 48.

      unum = ('0' >>
                  ( (omit[ char_("xXhH") ] >> hex)
                  | (omit[ char_("bByY") ] >> bin)
                  | (omit[ char_("oOqQ") ] >> oct)
                  | (omit[ char_("dDtT") ] >> dec))
              )
          | (hex >> omit[  char_("xXhH") ])
          | (bin >> omit[  char_("bByY") ])
          | (oct >> omit[  char_("oOqQ") ])
          | (dec >> omit[ -char_("dDtT") ]);
      
    • using phrase_parse and a skipper has little effect as long as you're using parser expressions that don't use a skipper (see Boost spirit skipper issues)

    Live On Coliru

    //#define BOOST_SPIRIT_DEBUG
    #include <boost/spirit/include/qi.hpp>
    #include <boost/spirit/include/phoenix.hpp>
    
    namespace qi      = boost::spirit::qi;
    namespace ascii   = boost::spirit::ascii;
    
    template <typename Iterator, typename Num>
    struct unsigned_parser : qi::grammar<Iterator, Num()> {
        unsigned_parser() : unsigned_parser::base_type(unum) {
            using namespace qi;
    
            bin  = eps[_val=0] >> *(char_("01")        [ _accum(_1, _val, 2 )] | '_');
            oct  = eps[_val=0] >> *(char_("0-7")       [ _accum(_1, _val, 8 )] | '_');
            dec  = eps[_val=0] >> *(char_("0-9")       [ _accum(_1, _val, 10)] | '_');
            hex  = eps[_val=0] >> *(char_("0-9a-fA-F") [ _accum(_1, _val, 16)] | '_');
            unum = ('0' >>
                        ( (omit[ char_("xXhH") ] >> hex)
                        | (omit[ char_("bByY") ] >> bin)
                        | (omit[ char_("oOqQ") ] >> oct)
                        | (omit[ char_("dDtT") ] >> dec))
                    )
                | (hex >> omit[  char_("xXhH") ])
                | (bin >> omit[  char_("bByY") ])
                | (oct >> omit[  char_("oOqQ") ])
                | (dec >> omit[ -char_("dDtT") ]);
    
            BOOST_SPIRIT_DEBUG_NODES((unum) (hex) (oct) (dec) (bin));
        }
    
      private:
        qi::rule<Iterator, Num()> unum,  hex, oct, dec, bin;
    
        struct accum_f {
            template <typename...> struct result { typedef void type; };
            void operator()(char ch, Num& accum, int base) const {
                accum *= base;
    
                if      (ch >= '0' && ch <= '9') accum += ch - '0';
                else if (ch >= 'a' && ch <= 'f') accum += ch - 'a' + 10;
                else if (ch >= 'A' && ch <= 'F') accum += ch - 'A' + 10;
                else assert(false);
            }
        };
        boost::phoenix::function<accum_f> _accum;
    };
    
        template <typename Iterator, typename Num>
        struct signed_parser : qi::grammar<Iterator, Num()> {
            signed_parser() : signed_parser::base_type(snum) {
                using namespace qi;
    
                snum %= lit('-') >> unum [ _val = -_1 ]
                     | -lit('+') >> unum
                     ;
    
                BOOST_SPIRIT_DEBUG_NODES((snum))
            }
        private:
            qi::rule<Iterator, Num()> snum;
            unsigned_parser<Iterator, Num> unum;
        };
    
    int main(int argc, const char *argv[]) {
        typedef std::string::const_iterator iter;
        signed_parser<iter, int64_t> const sp;
    
        for (std::string const& s : boost::make_iterator_range(argv+1, argv+argc))
        {
            std::cout << "\n-----------------------------\nParsing '" << s << "':\n";
    
            int64_t val;
            iter i = s.begin(), end = s.end();
            bool rv = phrase_parse(i, end, sp >> qi::eoi, ascii::space, val);
    
            if (rv) {
                std::cout << "Succeeded: " << val << std::endl;
            } else {
                std::cout << "Failed." << std::endl;
            }
    
            if (i!=end) {
                std::cout << "Remaining unparsed: '" << std::string(i,end) << "'\n";
            }
        }
    }
    

    Output:

    -----------------------------
    Parsing '-124_456d':
    Succeeded: -124456
    
    -----------------------------
    Parsing '123_456D':
    Succeeded: 123456
    
    -----------------------------
    Parsing '-123_456T':
    Succeeded: -123456
    
    -----------------------------
    Parsing '123456t':
    Succeeded: 123456
    
    -----------------------------
    Parsing '+1_bh':
    Succeeded: 27
    
    -----------------------------
    Parsing '0_010Q':
    Succeeded: 8
    
    -----------------------------
    Parsing '+1010_1010_0111_0111_b':
    Succeeded: 43639
    
    -----------------------------
    Parsing '123_456':
    Succeeded: 123456
    
    -----------------------------
    Parsing '-123456':
    Succeeded: -123456
    
    -----------------------------
    Parsing '1_bh':
    Succeeded: 27
    
    -----------------------------
    Parsing '-0_010Q':
    Succeeded: -8
    
    -----------------------------
    Parsing '1010_1010_0111_0111_b':
    Succeeded: 43639
    
    -----------------------------
    Parsing '+0d124_456':
    Succeeded: 124456
    
    -----------------------------
    Parsing '0D123_456':
    Succeeded: 123456
    
    -----------------------------
    Parsing '+0T123_456':
    Succeeded: 123456
    
    -----------------------------
    Parsing '0t123456':
    Succeeded: 123456
    
    -----------------------------
    Parsing '0h1_b':
    Succeeded: 27
    
    -----------------------------
    Parsing '0Q0_010':
    Succeeded: 8
    
    -----------------------------
    Parsing '0b1010_1010_0111_0111_':
    Succeeded: 43639
    
    -----------------------------
    Parsing '06123_45':
    Succeeded: 612345
    
    -----------------------------
    Parsing '0612345':
    Succeeded: 612345
    
    -----------------------------
    Parsing '0h1_b':
    Succeeded: 27
    
    -----------------------------
    Parsing '-0Q0_010':
    Succeeded: -8
    
    -----------------------------
    Parsing '0b1010_1010_0111_0111_':
    Succeeded: 43639