Parsing heterogeneous data using Boost::Spirit

I'm trying to figure out how to approach the following problem.

I have a structure of the following format:

// One record as described in the question: a timestamp, an id, and a set
// of optional key/value fields that may or may not appear in the input line.
struct Data
{
     time_t timestamp;
     string id;
     boost::optional<int> data1;     // unset when the key is absent from the input
     boost::optional<string> data2;  // unset when the key is absent from the input
     // etc...
};

This should be parsed out of a single line string in the following format:

human_readable_timestamp;id;key1=value1 key2=value2.....

Of course the ordering of the keys does not have to match the order of elements in the structure.

Is Boost::Spirit suitable for this type of data? How do I approach this? I have gone through the examples, but I can't manage to get from the examples to code that fits my requirements.


    If you don't wish to use semantic actions (see Boost Spirit: "Semantic actions are evil"?), you can slightly tweak the struct so that it matches the auto-synthesized attribute types when using the permutation parser for the data elements:

    // Record layout tweaked so that it mirrors the attribute the parser
    // synthesizes: timestamp, id, then one nested struct holding all of the
    // optional key/value fields.
    struct Data
    {
        boost::posix_time::ptime timestamp;
        std::string id;
        struct Fields {
            boost::optional<int> data1;          // value of key1, if present
            boost::optional<std::string> data2;  // value of key2, if present
        } fields;
    };

    Now the parser can just be:

        // Timestamp: qi::stream parses through the attribute type's stream
        // extraction operator (operator>>).
        timestamp = stream;
        // Double-quoted string; lexeme[] keeps the skipper from eating
        // whitespace between the quotes.
        text  = lexeme [ '"' >> *~char_('"') >> '"' ];
        // key1=<integer>
        data1 = "key1" >> lit('=') >> int_;
        // key2=<quoted text>
        data2 = "key2" >> lit('=') >> text;
        // Everything up to (not including) the next ';' -- may be empty.
        id    = lexeme [ *~char_(';') ];
        // timestamp;id; followed by data1 and data2 in either order. The
        // permutation operator (^) lets each operand match at most once and
        // requires at least one of them to match.
        start = timestamp >> ';' >> id >> ';' >> (data1 ^ data2);


    In response to the comment, I made the parser "resilient". I ended up moving away from the permutation parser and going with the first numbered approach (the Kleene star with semantic actions).

        id     = lexeme [ *~char_(';') ];
        auto data1 = bind(&Data::Fields::data1, _val);
        auto data2 = bind(&Data::Fields::data2, _val);
        other  = lexeme [ +(graph-'=') ] >> '=' >> (real_|int_|text);
        fields = *(
                    ("key1" >> lit('=') >> int_) [ data1 = _1 ]
                  | ("key2" >> lit('=') >> text) [ data2 = _1 ]
                  | other
        start  = timestamp >> ';' >> id >> -(';' >> fields);

    This changes the following aspects:

    • in order to be able to skip "other" fields, I needed to come up with a reasonable grammar for "other" fields:

      other  = lexeme [ +(graph-'=') ] >> '=' >> (real_|int_|text);

      (allows a key consisting of anything non-whitespace except =, followed by the =, followed by either something numeric (eager), or text).

    • I've extended the notion of text to support popular quoting/escaping schemes:

      // Double-quoted or single-quoted string with backslash escapes, or
      // else a bare run of non-whitespace characters.
      text   = lexeme [
                  '"' >> *('\\' >> char_ | ~char_('"')) >> '"'
                | "'" >> *('\\' >> char_ | ~char_("'")) >> "'"
                | *graph
             ];
    • it allows the same key to be repeated (in which case it retains the last valid value seen).

    • If you wanted to disallow invalid values, replace >> int_ or >> text by > int_ or > text (the expectation parser).

    I've extended the test cases with some challenging cases:

        2015-Jan-26 00:00:00;id
        2015-Jan-26 14:59:24;id;key2="value"
        2015-Jan-26 14:59:24;id;key2="value" key1=42
        2015-Jan-26 14:59:24;id;key2="value" key1=42 something=awful __=4.74e-10 blarg;{blo;bloop='whatever \'ignor\'ed' key2="new} \"value\""
        2015-Jan-26 14:59:24.123;id;key1=42 key2="value" 

    And it now prints

    Parsing '2015-Jan-26 00:00:00;id'
    Parsing success
    2015-Jan-26 00:00:00    id
    data1: --
    data2: --
    Parsing '2015-Jan-26 14:59:24;id;key2="value"'
    Parsing success
    2015-Jan-26 14:59:24    id
    data1: --
    data2:  value
    Parsing '2015-Jan-26 14:59:24;id;key2="value" key1=42'
    Parsing success
    2015-Jan-26 14:59:24    id
    data1:  42
    data2:  value
    Parsing '2015-Jan-26 14:59:24;id;key2="value" key1=42 something=awful __=4.74e-10 blarg;{blo;bloop='whatever \'ignor\'ed' key2="new} \"value\""'
    Parsing success
    2015-Jan-26 14:59:24    id
    data1:  42
    data2:  new} "value"
    Parsing '2015-Jan-26 14:59:24.123;id;key1=42 key2="value" '
    Parsing success
    2015-Jan-26 14:59:24.123000 id
    data1:  42
    data2:  value

    Live On Coliru

    //#define BOOST_SPIRIT_DEBUG
    #include <boost/optional/optional_io.hpp>
    #include <boost/date_time/posix_time/posix_time.hpp>
    #include <boost/date_time/posix_time/posix_time_io.hpp>
    #include <boost/fusion/adapted/struct.hpp>
    #include <boost/spirit/include/qi.hpp>
    #include <boost/spirit/include/phoenix.hpp>
    namespace qi = boost::spirit::qi;
    namespace phx = boost::phoenix;
    struct Data
        boost::posix_time::ptime timestamp;
        std::string id;
        struct Fields {
            boost::optional<int> data1;
            boost::optional<std::string> data2;
        } fields;
            (boost::optional<int>, data1)
            (boost::optional<std::string>, data2)
            (boost::posix_time::ptime, timestamp)
            (std::string, id)
            (Data::Fields, fields)
    // Qi grammar for one record line of the form
    //
    //     timestamp;id[;key=value key=value ...]
    //
    // It      - forward iterator over the input characters
    // Skipper - skip parser applied between tokens (whitespace by default)
    //
    // The synthesized attribute is a Data. key1 and key2 are stored in
    // Data::Fields; every other key=value pair is parsed and discarded.
    template <typename It, typename Skipper = qi::space_type>
    struct grammar : qi::grammar<It, Data(), Skipper> {
        grammar() : grammar::base_type(start) {
            using namespace qi;

            // qi::stream parses through ptime's stream extraction operator.
            timestamp = stream;

            // Strict policies make real_ reject plain integers, so that
            // `real_ | int_` below does not parse every integer as a double.
            real_parser<double, strict_real_policies<double> > real_;

            // Double-quoted or single-quoted string with backslash escapes,
            // or else a bare run of non-whitespace characters.
            text   = lexeme [
                        '"' >> *('\\' >> char_ | ~char_('"')) >> '"'
                      | "'" >> *('\\' >> char_ | ~char_("'")) >> "'"
                      | *graph
                   ];

            // Everything up to (not including) the next ';' -- may be empty.
            id     = lexeme [ *~char_(';') ];

            // Lazy accessors for the two members of the fields rule's
            // attribute (_val); the semantic actions assign through them.
            auto data1 = bind(&Data::Fields::data1, _val);
            auto data2 = bind(&Data::Fields::data2, _val);

            // Any unrecognised key=value pair; it has no attribute, so the
            // value is dropped.
            other  = lexeme [ +(graph-'=') ] >> '=' >> (real_|int_|text);

            // Zero or more pairs in any order. A repeated key overwrites the
            // value stored by the earlier occurrence.
            fields = *(
                        ("key1" >> lit('=') >> int_) [ data1 = _1 ]
                      | ("key2" >> lit('=') >> text) [ data2 = _1 ]
                      | other
                      );

            // The whole ";fields" section is optional.
            start  = timestamp >> ';' >> id >> -(';' >> fields);
        }

        qi::rule<It,                                 Skipper> other;
        qi::rule<It, std::string(),                  Skipper> text, id;
        qi::rule<It, boost::posix_time::ptime(),     Skipper> timestamp;
        qi::rule<It, Data::Fields(),                 Skipper> fields;
        qi::rule<It, Data(),                         Skipper> start;
    };
    // Test driver: parses each sample line, prints the parsed record or a
    // failure notice, and reports any input the parser left unconsumed.
    int main() {
        using It = std::string::const_iterator;

        for (std::string const input : {
                "2015-Jan-26 00:00:00;id",
                "2015-Jan-26 14:59:24;id;key2=\"value\"",
                "2015-Jan-26 14:59:24;id;key2=\"value\" key1=42",
                "2015-Jan-26 14:59:24;id;key2=\"value\" key1=42 something=awful __=4.74e-10 blarg;{blo;bloop='whatever \\'ignor\\'ed' key2=\"new} \\\"value\\\"\"",
                "2015-Jan-26 14:59:24.123;id;key1=42 key2=\"value\" ",
            })
        {
            std::cout << "----------------------------------------\nParsing '" << input << "'\n";
            It f(input.begin()), l(input.end());
            Data parsed;
            bool ok = qi::phrase_parse(f,l,grammar<It>(),qi::space,parsed);

            if (ok) {
                std::cout << "Parsing success\n";
                std::cout << parsed.timestamp << "\t" << parsed.id << "\n";
                std::cout << "data1: " << parsed.fields.data1 << "\n";
                std::cout << "data2: " << parsed.fields.data2 << "\n";
            } else {
                std::cout << "Parsing failed\n";
            }

            // phrase_parse can succeed without consuming all of the input,
            // so report any leftover text separately.
            if (f!=l)
                std::cout << "Remaining unparsed: '" << std::string(f,l) << "'\n";
        }
    }