Search code examples
c++csvboostboost-spiritboost-spirit-qi

parse typed csv file with boost::spirit::qi


I want to parse a CSV file with typed values. The type of every column is defined in the header, e.g.:

int double double int unsigned
12  1.3    23445  1   42
45  46     47     48  49

The result data structure may be something like this 2-dimensional vector:

// Column-oriented storage: a column is a single vector whose element type is
// one of the types that can be named in the header line.
using ColumnType = boost::variant<
  std::vector<int>,
  std::vector<unsigned>,
  std::vector<double>
>;

// The whole table, stored as a sequence of columns.
using ResultType = std::vector<ColumnType>;

My working code:

// Short aliases for the Boost namespaces used by the grammar.
namespace phoenix = boost::phoenix;
namespace qi = boost::spirit::qi;
namespace ascii = boost::spirit::ascii;

// Column-oriented storage: a column is a single vector whose element type is
// one of the types that can be named in the header line.
using ColumnType = boost::variant<
  std::vector<int>,
  std::vector<unsigned>,
  std::vector<double>
>;

// The whole table, stored as a sequence of columns.
using ResultType = std::vector<ColumnType>;

// Tag for the element type of a column, as named in the header line.
enum class CSVDataType
{
  Int,     // header keyword "int"
  UInt,    // header keyword "unsigned"
  Double   // header keyword "double"
};

/// Spirit.Qi grammar for a blank-separated table whose first line names the
/// type of every column ("int", "unsigned" or "double").
///
/// Parsed values are collected column-wise in the public member `result`.
/// On a successful parse the same data is also copied into the grammar's
/// synthesized attribute, so callers may either pass a ResultType to
/// qi::phrase_parse or read `result` afterwards.
///
/// The grammar keeps per-parse state in its members, so a single instance
/// must not be used for two parses at the same time. Reusing it for
/// consecutive parses is fine: the state is reset when a header is parsed.
template<typename Iterator>
struct TypedCSVGrammar: qi::grammar<Iterator, ResultType(), ascii::blank_type> {
  /// Maps the header keywords onto the column type tags.
  struct types_: qi::symbols<char, CSVDataType> {
    types_() {
      add
        ("int", CSVDataType::Int)
        ("unsigned", CSVDataType::UInt)
        ("double", CSVDataType::Double);
    }
  } types;

  TypedCSVGrammar() :
    TypedCSVGrammar::base_type(csv, "csv")
  {
    using namespace qi::labels;

    // Header line: a sequence of type keywords.
    header %= *(types);

    // A cell is parsed by the numeric parser that matches the type of the
    // column given as inherited attribute (_r1). Each eps guard selects the
    // alternative; the semantic action appends the value to that column.
    cell =
      (
        qi::eps(phoenix::ref(column_types)[_r1] == phoenix::val(CSVDataType::Int))
        >> qi::int_ [phoenix::bind(&TypedCSVGrammar::add_int, this, _r1, _1)]
      ) | (
        qi::eps(phoenix::ref(column_types)[_r1] == phoenix::val(CSVDataType::UInt))
        >> qi::uint_ [phoenix::bind(&TypedCSVGrammar::add_uint, this, _r1, _1)]
      ) | (
        qi::eps(phoenix::ref(column_types)[_r1] == phoenix::val(CSVDataType::Double))
        >> qi::double_ [phoenix::bind(&TypedCSVGrammar::add_double, this, _r1, _1)]
      );

    // A row is exactly one cell per declared column; `column` tracks the
    // index of the cell currently being parsed and restarts at 0 per row.
    row =
      qi::eps [phoenix::ref(column) = phoenix::val(0)]
      >> qi::repeat(phoenix::size(phoenix::ref(column_types))) [
        cell(phoenix::ref(column))
        >> qi::eps [phoenix::ref(column)++]
      ];

    // `csv` is assigned with '=' and contains semantic actions, so Qi does
    // not propagate component attributes into the rule's attribute on its
    // own. The final eps therefore copies the collected columns into _val
    // explicitly once everything else has matched.
    csv =
      header [phoenix::bind(&TypedCSVGrammar::construct_columns, this, _1)]
      > qi::eol
      > row % qi::eol
      > *qi::eol
      > qi::eps [_val = phoenix::ref(result)];

    qi::on_error<qi::fail>
    (
        csv
      , std::cout
            << phoenix::val("Error! Expecting ")
            << _4                               // what failed?
            << phoenix::val(" here: \"")
            << phoenix::construct<std::string>(_3, _2)   // iterators to error-pos, end
            << phoenix::val("\"")
            << std::endl
    );
  }

  /// Appends `i` to column `c`; throws boost::bad_get if that column does
  /// not hold std::vector<int>.
  void add_int(std::size_t c, int i) {
    boost::get<std::vector<int>>(result[c]).push_back(i);
  }
  /// Appends `i` to column `c`; throws boost::bad_get if that column does
  /// not hold std::vector<unsigned>.
  void add_uint(std::size_t c, unsigned i) {
    boost::get<std::vector<unsigned>>(result[c]).push_back(i);
  }
  /// Appends `i` to column `c`; throws boost::bad_get if that column does
  /// not hold std::vector<double>.
  void add_double(std::size_t c, double i) {
    boost::get<std::vector<double>>(result[c]).push_back(i);
  }

  /// Records the column types from the header and creates one empty column
  /// of the matching element type for each of them.
  ///
  /// Any columns left over from a previous parse are discarded first, so
  /// that reusing the grammar does not append new columns after stale ones.
  void construct_columns(const std::vector<CSVDataType>& columns) {
    column_types = columns;
    result.clear();
    result.reserve(columns.size());
    for (const auto& c : columns) {
      switch (c) {
      case CSVDataType::Int:
        result.push_back(std::vector<int>());
        break;

      case CSVDataType::UInt:
        result.push_back(std::vector<unsigned>());
        break;

      case CSVDataType::Double:
        result.push_back(std::vector<double>());
        break;
      }
    }
  }

  std::vector<CSVDataType> column_types;  // type tag per column, from the header
  std::size_t column = 0;                 // index of the cell being parsed in the current row
  ResultType result;                      // parsed data, one entry per column

  qi::rule<Iterator, ResultType(), ascii::blank_type> csv;
  qi::rule<Iterator, std::vector<CSVDataType>(), ascii::blank_type> header;
  qi::rule<Iterator, void(std::size_t), ascii::blank_type> cell;
  qi::rule<Iterator, void(), ascii::blank_type> row;
};

Is there a better solution? I want to support more than just 3 types (possibly more than 10), and with this approach that would mean a lot of repetitive code.


Solution

  • I don't see why you would come up with such a contrived target data structure. It seems to invite errors with unmatched indices.

    May I suggest using the Nabialek Trick here?

    It works well if you change the AST around to:

    using ValueType = boost::variant<int, unsigned, double>;
    using ResultType = std::vector<std::vector<ValueType>>;
    

    (This seems like a more desirable approach anyways)

    In short, you translate the column types into a vector of parser rules (std::vector<dynamic>).

    Live On Coliru

    #define BOOST_SPIRIT_DEBUG
    #include <boost/spirit/include/qi.hpp>
    #include <boost/spirit/include/phoenix.hpp>
    
    // Short aliases for the Boost namespaces used below.
    namespace px    = boost::phoenix;
    namespace qi    = boost::spirit::qi;
    namespace ascii = boost::spirit::ascii;
    
    // A single parsed cell; holds whichever numeric type its column declares.
    using ValueType = boost::variant<int, unsigned, double>;
    // The table in row-major form: one inner vector per parsed row.
    using ResultType = std::vector<std::vector<ValueType>>;
    
    // Tags for the column types that may be named in the header line.
    enum class CSVDataType {
        Int,
        UInt,
        Double
    };
    
    namespace boost { namespace spirit { namespace qi { // FOR DEBUG
        // Stream inserters used only by the BOOST_SPIRIT_DEBUG trace: rule
        // pointers and vectors of rule pointers are printed as fixed
        // placeholder texts.
        template <typename... T>
        std::ostream& operator<<(std::ostream& out, rule<T...> const*) {
            out << "(lazy rule)";
            return out;
        }

        template <typename... T>
        std::ostream& operator<<(std::ostream& out, std::vector<rule<T...> const*> const&) {
            out << "(column mappings)";
            return out;
        }
    } } }
    
    /// Spirit.Qi grammar for a table whose first line names the type of every
    /// column ("int", "unsigned" or "double"); the synthesized attribute is the
    /// table in row-major form.
    ///
    /// The header is translated into a vector of pointers to cell parsers (the
    /// "Nabialek trick"); every data row is then parsed by invoking those
    /// parsers in order through qi::lazy.
    ///
    /// Every data row must contain exactly one cell per declared column, and
    /// at least one data row is expected after the header. A trailing newline
    /// does not add an empty row to the result.
    template<typename Iterator, typename Skipper = ascii::blank_type>
    struct TypedCSVGrammar: qi::grammar<Iterator, ResultType(), Skipper> {
    
        TypedCSVGrammar() : TypedCSVGrammar::base_type(start, "csv")
        {
            using namespace qi::labels;
    
            // Each keyword yields the address of the matching cell parser.
            header = *types;
    
            // - The header's parsers are stored in the local _cols; omit[]
            //   keeps the header out of the synthesized attribute.
            // - For every row, _current restarts at 0 and exactly size(_cols)
            //   cells are parsed. The min/max form repeat(0, size) would also
            //   accept short and empty rows.
            // - '%=' is required: the expression now contains a visible
            //   semantic action, and plain '=' would switch automatic
            //   attribute propagation off.
            csv   %= qi::omit[ header [ _cols = _1 ] ] > qi::eol
                   > ( qi::eps [ _current = 0 ]
                       >> qi::repeat(px::size(_cols)) [ qi::lazy(*_cols[_current++]) ]
                     ) % qi::eol
                   > *qi::eol
                   ;
    
            start = csv;
    
            BOOST_SPIRIT_DEBUG_NODES((start)(csv)(header));
    
            qi::on_error<qi::fail> (csv, px::ref(std::cout)
                        << "Error! Expecting " << _4                                  // what failed?
                        << " here: \""         << px::construct<std::string>(_3, _2)  // iterators to error-pos, end
                        << "\"\n"
                );
        }
    
      private:
        using cell_parser_t = qi::rule<Iterator, ValueType(), Skipper>;
        using dynamic       = cell_parser_t const*;  // non-owning; points at a member of `types`
    
        /// Maps each header keyword onto the parser for cells of that type.
        struct types_: qi::symbols<char, dynamic> {
            cell_parser_t
                int_cell    = qi::int_,
                uint_cell   = qi::uint_,
                double_cell = qi::double_;
    
            types_() {
                this->add
                    ("int",      &int_cell)
                    ("unsigned", &uint_cell)
                    ("double",   &double_cell);
                BOOST_SPIRIT_DEBUG_NODES((int_cell)(uint_cell)(double_cell))
            }
        } types;
    
        // Locals of the csv rule: the per-column parsers taken from the header
        // and the index of the column currently being parsed.
        using state = qi::locals<std::vector<dynamic>, size_t>;
        qi::_a_type _cols;
        qi::_b_type _current;
    
        qi::rule<Iterator, ResultType(),             Skipper> start;
        qi::rule<Iterator, std::vector<dynamic>(),   Skipper> header;
        qi::rule<Iterator, ResultType(),             Skipper, state>  csv;
    };
    
    int main() {
        using It = boost::spirit::istream_iterator;
    
        It f(std::cin >> std::noskipws), l;
        TypedCSVGrammar<It> g;
        ResultType data;
        bool ok = qi::phrase_parse(f, l, g, ascii::blank, data);
        if (ok) {
            std::cout << "Parse success\n";
    
            for(auto& row: data) {
                for(auto& cell: row) std::cout << cell << "\t";
                std::cout << "\n";
            }
        }
        else
            std::cout << "Parse failed\n";
    
        if (f!=l)
            std::cout << "Remaining unparsed: '" << std::string(f,l) << "'\n";
    }
    

    So for the input shown it prints

    Parse success
    12  1.3 23445   1   42  
    45  46  47  48  49  
    

    And the debug information is:

    <start>
      <try>int double double in</try>
      <csv>
        <try>int double double in</try>
        <header>
          <try>int double double in</try>
          <success>\n12  1.3    23445  1</success>
          <attributes>[[(lazy rule), (lazy rule), (lazy rule), (lazy rule), (lazy rule)]]</attributes>
        </header>
        <int_cell>
          <try>12  1.3    23445  1 </try>
          <success>  1.3    23445  1   </success>
          <attributes>[12]</attributes>
        </int_cell>
        <double_cell>
          <try>  1.3    23445  1   </try>
          <success>    23445  1   42\n45</success>
          <attributes>[1.3]</attributes>
        </double_cell>
        <double_cell>
          <try>    23445  1   42\n45</try>
          <success>  1   42\n45  46     </success>
          <attributes>[23445]</attributes>
        </double_cell>
        <int_cell>
          <try>  1   42\n45  46     </try>
          <success>   42\n45  46     47 </success>
          <attributes>[1]</attributes>
        </int_cell>
        <uint_cell>
          <try>   42\n45  46     47 </try>
          <success>\n45  46     47     4</success>
          <attributes>[42]</attributes>
        </uint_cell>
        <int_cell>
          <try>45  46     47     48</try>
          <success>  46     47     48  </success>
          <attributes>[45]</attributes>
        </int_cell>
        <double_cell>
          <try>  46     47     48  </try>
          <success>     47     48  49\n</success>
          <attributes>[46]</attributes>
        </double_cell>
        <double_cell>
          <try>     47     48  49\n</try>
          <success>     48  49\n</success>
          <attributes>[47]</attributes>
        </double_cell>
        <int_cell>
          <try>     48  49\n</try>
          <success>  49\n</success>
          <attributes>[48]</attributes>
        </int_cell>
        <uint_cell>
          <try>  49\n</try>
          <success>\n</success>
          <attributes>[49]</attributes>
        </uint_cell>
        <int_cell>
          <try></try>
          <fail/>
        </int_cell>
        <success></success>
        <attributes>[[[12, 1.3, 23445, 1, 42], [45, 46, 47, 48, 49], []]]</attributes><locals>((column mappings) 1)</locals>
      </csv>
      <success></success>
      <attributes>[[[12, 1.3, 23445, 1, 42], [45, 46, 47, 48, 49], []]]</attributes>
    </start>