Search code examples
c++boost-spiritboost-spirit-qi

Parse memory-mapped files using Spirit: quoted and unquoted fields


Following the answer from sehe, I want to parse quoted and unquoted content from a memory-mapped file as fast as possible.

The current parser looks like this:

namespace qi = boost::spirit::qi;

// Parse result: one inner vector per input line, one boost::string_ref per field.
using MatrixType =  std::vector<std::vector<boost::string_ref>>;
// Grammar for comma-separated text whose fields are either quoted ('...' or "...")
// or unquoted. `It` is the iterator type of the input; the skipper is qi::blank.
// Each field is exposed as a boost::string_ref built from the matched source range.
//
// Known issue (the subject of this question): because `unquoted` may match zero
// characters, `table = line % eol` can yield one extra row holding a single empty field.
template<typename It>
struct parser : qi::grammar<It,  MatrixType(), qi::blank_type,  qi::locals<char> >
{
    parser()
    : parser::base_type( table, "parser" )
{
    using namespace boost::phoenix;
    using namespace qi;

    // Fields are separated by a single comma.
    delimiter = ',';
    // Quoted field: remember the opening quote character in the local _a, capture
    // everything up to the next occurrence of that same character, then consume it.
    quoted =
        omit [ char_("'\"") [_a = _1] ]
        >> raw [ *(char_ - char_(_a)) ] [  _val = construct<boost::string_ref>(begin(_1), size(_1)) ]
        >> lit(_a);
     // Unquoted field: every character up to (not including) an end-of-line or a
     // delimiter. The Kleene star means an empty match is accepted as a field.
     unquoted = raw[ *(char_ - (eol | delimiter) ) ] [ _val = construct<boost::string_ref>(begin(_1), size(_1))]; //raw [ *(char_ - char_("\"',")) ] [  _val = construct<boost::string_ref>(begin(_1), size(_1)) ];

    // A field is tried as quoted first, falling back to unquoted.
    any_string = quoted | unquoted;
    // A line is one or more fields separated by the delimiter.
    line  = any_string  % delimiter;
    // The table is one or more lines separated by end-of-line.
    table = line % eol;

}

// Field-level rules: string_ref attribute, a char local, blank skipper.
qi::rule<It, boost::string_ref() ,qi::locals<char> , qi::blank_type> any_string;
qi::rule<It, boost::string_ref() ,qi::locals<char> , qi::blank_type> quoted;
qi::rule<It, boost::string_ref() ,qi::locals<char> , qi::blank_type> unquoted;
// Delimiter rule: no attribute and no skipper declared.
qi::rule<It> delimiter;
 // One row of fields.
 qi::rule<It, std::vector<boost::string_ref>(), qi::blank_type> line;
// Start rule: all rows of the input.
qi::rule<It, MatrixType(), qi::blank_type, qi::locals<char>> table;

};

Example input file:

"a","b",   "c", "d,e,f"
"a", -1, abc, 0.1

The current parser adds one extra, non-existent empty line to the result, even though there is no "\n" at the end of the file.


Solution

  • The problem is that end-of-line and end-of-input are implicitly delimiters.

    Since unquoted fields are allowed to be "empty" (zero-length), this will just parse one final line containing a single empty field.

    I'd suggest checking for end-of-input specifically:

    row = !eoi >> any_string % delimiter;
    

    Rows are rejected if there's nothing at all to be read. In order to be lenient and allow trailing empty lines, you could "eat" those:

    table = row % eol >> *eol;
    

    Finally, if you also want to "eat" empty lines in between table rows, simply allow the eol separator to repeat (Kleene plus):

    table = row % +eol >> *eol;
    

    Demo Live On Coliru

    #define BOOST_SPIRIT_DEBUG
    #include <boost/utility/string_ref.hpp>
    #include <boost/spirit/include/qi.hpp>
    #include <boost/spirit/include/phoenix.hpp>
    
    namespace qi = boost::spirit::qi;
    
    // Parse result: one inner vector per row, one boost::string_ref per field.
    using MatrixType = std::vector<std::vector<boost::string_ref>>;
    
    // Grammar for comma-separated rows of quoted ('...' or "...") or unquoted fields.
    // `It` is the input iterator type; the skipper (qi::blank) applies to `table` and
    // `row` only. The field-level rules are declared without a skipper.
    // Each field is exposed as a boost::string_ref built from the matched source range.
    template<typename It>
    struct parser : qi::grammar<It, MatrixType(), qi::blank_type >
    {
        parser() : parser::base_type(table, "parser")
        {
            namespace px = boost::phoenix;
            using namespace qi;
    
            // Fields are separated by a single comma.
            delimiter = ',';
            // Quoted field: store the opening quote character in the local _a, capture
            // everything up to the next occurrence of that character, then consume it.
            quoted    =
                   char_("'\"") [_a = _1] 
                >> raw [ *(char_ - char_(_a)) ] [ _val = px::construct<boost::string_ref>(px::begin(_1), px::size(_1)) ]
                >> lit(_a);
            // Unquoted field: every character up to (not including) an end-of-line or a
            // delimiter; may be empty because of the Kleene star.
            unquoted   = raw[ *(char_ - (eol | delimiter) ) ] [ _val = px::construct<boost::string_ref>(px::begin(_1), px::size(_1))]; 
    
            // A field is tried as quoted first, falling back to unquoted.
            any_string = quoted | unquoted;
            // `!eoi` rejects a row when no input is left; without it the possibly-empty
            // `unquoted` would produce a phantom final row with one empty field.
            row        = !eoi >> any_string % delimiter;
            // Rows are separated by one or more line breaks (so empty lines between rows
            // are skipped); trailing line breaks after the last row are consumed too.
            table      = row % +eol >> *eol;
    
            // Registers the rules for debug tracing (BOOST_SPIRIT_DEBUG is defined above).
            BOOST_SPIRIT_DEBUG_NODES((delimiter)(quoted)(unquoted)(any_string)(row)(table))
        }
    
      private:
        // Rules that use the blank skipper.
        qi::rule<It, MatrixType(),             qi::blank_type> table;
        qi::rule<It, MatrixType::value_type(), qi::blank_type> row;
    
        // lexemes
        // (declared without a skipper; `quoted` carries a char local for the quote character)
        qi::rule<It, boost::string_ref(), qi::locals<char> > quoted;
        qi::rule<It, boost::string_ref()> any_string, unquoted;
        qi::rule<It> delimiter;
    };
    
    #include <fstream>
    #include <iostream>
    #include <string>
    #include <boost/iostreams/device/mapped_file.hpp>
    
    /// Demo driver: memory-maps "input.txt", parses it with `parser` using the
    /// qi::blank skipper, and prints every row with its cells separated by '|'.
    ///
    /// @return 0 when the whole input was parsed successfully; 1 when parsing
    ///         failed or some input was left unconsumed.
    int main() {
        using It = const char*;
    
        // The parsed string_refs are built from iterators into this mapping, so
        // `source` has to stay alive for as long as `data` is used.
        boost::iostreams::mapped_file_source source("input.txt");
        It first = source.begin();
        It last  = source.end();
    
        parser<It> grammar;
        MatrixType data;
        const bool ok = qi::phrase_parse(first, last, grammar, qi::blank, data);
    
        if (ok) {
            std::cout << "Parsed: \n";
            // Rows and cells are only read here, so bind them as const references.
            for (const auto& row : data)
            {
                for (const auto& cell : row)
                    std::cout << cell << "|";
                std::cout << "\n";
            }
        } else
        {
            std::cout << "Failed to parse\n";
        }
    
        // phrase_parse advances `first`; anything between it and `last` was not consumed.
        const bool consumed_all = (first == last);
        if (!consumed_all) {
            std::cout << "Remaining input unparsed: '" << std::string(first, last) << "'\n";
        }
    
        // Report failure or a partial parse through the exit status as well.
        return (ok && consumed_all) ? 0 : 1;
    }
    

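    Prints (judging by the third row, the demo's input evidently has an additional third line with an empty second field, beyond the two lines shown in the question):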

    Parsed: 
    a|b|c|d,e,f|
    a|-1|abc|0.1|
    a||abc|0.1|