Parsing heterogeneous data using Boost::Spirit

I'm trying to figure out how to approach the following problem.

I have a structure of the following format:

// Target record for one input line of the form
// "human_readable_timestamp;id;key1=value1 key2=value2 ...".
struct Data
{
     time_t timestamp;               // leading human-readable timestamp field
     string id;                      // second ';'-separated field
     boost::optional<int> data1;     // optional value taken from the key=value section
     boost::optional<string> data2;  // optional value taken from the key=value section
     // etc...
};

This should be parsed out of a single line string in the following format:

human_readable_timestamp;id;key1=value1 key2=value2.....

Of course the ordering of the keys does not have to match the order of elements in the structure.

Is Boost::Spirit suitable for this type of data? How do I approach this? I have gone through the examples, but I can't manage to get from the examples to code that fits my requirements.

Solution

You could use the permutation parser. I've made a very similar example here:

Reading JSON file with C++ and BOOST

If you have repeating keys, then it makes more sense to use a Kleene star (`*`), perhaps

1. with semantic actions to assign the attributes, or
2. using attribute customization points to assign the result.

PS. Also look at the keyword parser from the Spirit Repository (Boost Qi Composing rules using Functions).

If you don't wish to use semantic actions (Boost Spirit: "Semantic actions are evil"?) you can slightly tweak the struct so that it matches the auto-synthesized attribute types when using the permutation for data elements:

// Data reshaped so that the key/value members live in their own sub-struct,
// matching the attribute synthesized by the permutation parser (data1 ^ data2).
struct Data
{
    boost::posix_time::ptime timestamp;
    std::string id;
    struct Fields {
        boost::optional<int> data1;         // value of "key1", if present
        boost::optional<std::string> data2; // value of "key2", if present
    } fields;
};

Now the parser can just be:

    // The timestamp is read through the attribute type's stream extraction (qi::stream).
    timestamp = stream;

    // Double-quoted string; lexeme[] stops the skipper from eating spaces inside it.
    text  = lexeme [ '"' >> *~char_('"') >> '"' ];
    data1 = "key1" >> lit('=') >> int_;
    data2 = "key2" >> lit('=') >> text;
    // Everything up to the next ';'.
    id    = lexeme [ *~char_(';') ];

    // The permutation operator (^) accepts data1 and data2 in either order,
    // each at most once; at least one of them must be present.
    start = timestamp >> ';' >> id >> ';' >> (data1 ^ data2);

UPDATE

In response to the comment, I made the parser "resilient". I ended up moving away from the permutation parser and going with the first numbered approach (the Kleene star with semantic actions).

    // Everything up to the next ';'.
    id     = lexeme [ *~char_(';') ];

    // Lazy accessors for the members of the rule's attribute (_val); they are
    // the assignment targets of the semantic actions below.
    auto data1 = bind(&Data::Fields::data1, _val);
    auto data2 = bind(&Data::Fields::data2, _val);

    // Any unrecognized key=value pair; matched only so that it can be skipped.
    other  = lexeme [ +(graph-'=') ] >> '=' >> (real_|int_|text);

    // Zero or more pairs in any order. Known keys are tried first; a repeated
    // key overwrites the value stored earlier.
    fields = *(
                ("key1" >> lit('=') >> int_) [ data1 = _1 ]
              | ("key2" >> lit('=') >> text) [ data2 = _1 ]
              | other
              );

    // The ";fields" tail is optional, so "timestamp;id" alone is accepted.
    start  = timestamp >> ';' >> id >> -(';' >> fields);

This changes the following aspects:

in order to be able to skip "other" fields, I needed to come up with a reasonable grammar for "other" fields:
```
// Unknown key=value pair: parsed only so that it can be skipped.
other  = lexeme [ +(graph-'=') ] >> '=' >> (real_|int_|text);
```
(allows a key consisting of anything non-whitespace except =, followed by the =, followed by either something numeric (eager), or text).

I've extended the notion of text to support popular quoting/escaping schemes:

// Quoted text with backslash escapes (double or single quotes), or a bare run
// of printable non-space characters.
//
// hold[] gives each quoted alternative a private copy of the attribute, which
// is only swapped in on success. Without it, characters collected before a
// missing closing quote would remain in the string and be followed by
// whatever the *graph fallback matches.
text   = lexeme [
            hold[ '"' >> *('\\' >> char_ | ~char_('"')) >> '"' ]
          | hold[ "'" >> *('\\' >> char_ | ~char_("'")) >> "'" ]
          | *graph
       ];

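- It allows the same key to be repeated (in which case it retains the last valid value seen).
- If you want to reject invalid values instead of skipping them, replace `>> int_` or `>> text` with `> int_` or `> text` (the expectation operator), which throws `qi::expectation_failure` when the value does not match.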

I've extended the test cases with some challenging cases:

    2015-Jan-26 00:00:00;id
    2015-Jan-26 14:59:24;id;key2="value"
    2015-Jan-26 14:59:24;id;key2="value" key1=42
    2015-Jan-26 14:59:24;id;key2="value" key1=42 something=awful __=4.74e-10 blarg;{blo;bloop='whatever \'ignor\'ed' key2="new} \"value\""
    2015-Jan-26 14:59:24.123;id;key1=42 key2="value"

And it now prints

----------------------------------------
Parsing '2015-Jan-26 00:00:00;id'
Parsing success
2015-Jan-26 00:00:00    id
data1: --
data2: --
----------------------------------------
Parsing '2015-Jan-26 14:59:24;id;key2="value"'
Parsing success
2015-Jan-26 14:59:24    id
data1: --
data2:  value
----------------------------------------
Parsing '2015-Jan-26 14:59:24;id;key2="value" key1=42'
Parsing success
2015-Jan-26 14:59:24    id
data1:  42
data2:  value
----------------------------------------
Parsing '2015-Jan-26 14:59:24;id;key2="value" key1=42 something=awful __=4.74e-10 blarg;{blo;bloop='whatever \'ignor\'ed' key2="new} \"value\""'
Parsing success
2015-Jan-26 14:59:24    id
data1:  42
data2:  new} "value"
----------------------------------------
Parsing '2015-Jan-26 14:59:24.123;id;key1=42 key2="value" '
Parsing success
2015-Jan-26 14:59:24.123000 id
data1:  42
data2:  value

Live On Coliru

//#define BOOST_SPIRIT_DEBUG
#include <iostream>
#include <string>

#include <boost/optional/optional_io.hpp>
#include <boost/date_time/posix_time/posix_time.hpp>
#include <boost/date_time/posix_time/posix_time_io.hpp>
#include <boost/fusion/adapted/struct.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>

namespace qi = boost::spirit::qi;
namespace phx = boost::phoenix;

// One parsed input line: "timestamp;id[;key=value ...]".
struct Data
{
    boost::posix_time::ptime timestamp;
    std::string id;
    // Recognized key=value entries; a member stays empty when its key was not seen.
    struct Fields {
        boost::optional<int> data1;         // value of "key1"
        boost::optional<std::string> data2; // value of "key2"
    } fields;
};

// Expose Data::Fields to Boost.Fusion so it can serve as a Qi rule attribute.
BOOST_FUSION_ADAPT_STRUCT(Data::Fields,
        (boost::optional<int>, data1)
        (boost::optional<std::string>, data2)
    )

// Expose Data as a Fusion sequence; the member order listed here matches the
// order of the operands in the grammar's start rule (timestamp, id, fields).
BOOST_FUSION_ADAPT_STRUCT(Data,
        (boost::posix_time::ptime, timestamp)
        (std::string, id)
        (Data::Fields, fields)
    )

/// Qi grammar for one record line: `timestamp;id[;key=value key=value ...]`.
///
/// The synthesized attribute is a Data. "key1" (int) and "key2" (text) are
/// stored in Data::Fields; any other key=value pair is parsed and discarded.
/// Keys may appear in any order and may repeat, in which case the last value
/// stored wins.
///
/// @tparam It      iterator type of the input.
/// @tparam Skipper skipper used between tokens (whitespace by default).
template <typename It, typename Skipper = qi::space_type>
struct grammar : qi::grammar<It, Data(), Skipper> {
    grammar() : grammar::base_type(start) {
        using namespace qi;

        // The timestamp is read through ptime's stream extraction operator.
        timestamp = stream;

        // Strict policies: a bare integer is not accepted as a real number,
        // which leaves it to int_ in the alternative below.
        real_parser<double, strict_real_policies<double> > real_;

        // Quoted text with backslash escapes (double or single quotes), or a
        // bare run of printable non-space characters.
        //
        // hold[] gives each quoted alternative a private copy of the
        // attribute, which is only swapped in on success. Without it,
        // characters collected before a missing closing quote would remain in
        // the string and be followed by whatever the *graph fallback matches.
        text   = lexeme [
                    hold[ '"' >> *('\\' >> char_ | ~char_('"')) >> '"' ]
                  | hold[ "'" >> *('\\' >> char_ | ~char_("'")) >> "'" ]
                  | *graph
               ];

        // Everything up to the next ';'.
        id     = lexeme [ *~char_(';') ];

        // Lazy accessors for the members of the `fields` rule attribute
        // (_val); they are the assignment targets of the semantic actions.
        auto data1 = bind(&Data::Fields::data1, _val);
        auto data2 = bind(&Data::Fields::data2, _val);

        // Any unrecognized key=value pair; matched only so it can be skipped.
        other  = lexeme [ +(graph-'=') ] >> '=' >> (real_|int_|text);

        // Zero or more pairs in any order. Known keys are tried first; a
        // repeated key overwrites the value stored earlier.
        fields = *(
                    ("key1" >> lit('=') >> int_) [ data1 = _1 ]
                  | ("key2" >> lit('=') >> text) [ data2 = _1 ]
                  | other
                  );

        // The ";fields" tail is optional, so "timestamp;id" alone is accepted.
        start  = timestamp >> ';' >> id >> -(';' >> fields);

        BOOST_SPIRIT_DEBUG_NODES((timestamp)(id)(start)(text)(other)(fields))
    }
  private:
    qi::rule<It,                                 Skipper> other;
    qi::rule<It, std::string(),                  Skipper> text, id;
    qi::rule<It, boost::posix_time::ptime(),     Skipper> timestamp;
    qi::rule<It, Data::Fields(),                 Skipper> fields;
    qi::rule<It, Data(),                         Skipper> start;
};

int main() {
    using It = std::string::const_iterator;
    for (std::string const input : {
            "2015-Jan-26 00:00:00;id",
            "2015-Jan-26 14:59:24;id;key2=\"value\"",
            "2015-Jan-26 14:59:24;id;key2=\"value\" key1=42",
            "2015-Jan-26 14:59:24;id;key2=\"value\" key1=42 something=awful __=4.74e-10 blarg;{blo;bloop='whatever \\'ignor\\'ed' key2=\"new} \\\"value\\\"\"",
            "2015-Jan-26 14:59:24.123;id;key1=42 key2=\"value\" ",
            })
    {
        std::cout << "----------------------------------------\nParsing '" << input << "'\n";
        It f(input.begin()), l(input.end());
        Data parsed;
        bool ok = qi::phrase_parse(f,l,grammar<It>(),qi::space,parsed);

        if (ok) {
            std::cout << "Parsing success\n";
            std::cout << parsed.timestamp << "\t" << parsed.id << "\n";
            std::cout << "data1: " << parsed.fields.data1 << "\n";
            std::cout << "data2: " << parsed.fields.data2 << "\n";
        } else {
            std::cout << "Parsing failed\n";
        }

        if (f!=l)
            std::cout << "Remaining unparsed: '" << std::string(f,l) << "'\n";
    }
}