c++ csv boost boost-spirit boost-spirit-qi

parse typed csv file with boost::spirit::qi

I want to parse a CSV file with typed values. The type of every column is defined in the header, e.g.:

int double double int unsigned
12  1.3    23445  1   42
45  46     47     48  49

The result data structure may be something like this 2-dimensional vector:

// One whole column: a vector of exactly one of the supported element types.
using ColumnType = boost::variant<
  std::vector<int>,
  std::vector<unsigned>,
  std::vector<double>
>;

// The parsed table, stored column by column.
using ResultType = std::vector<ColumnType>;

My working code:

namespace phoenix = boost::phoenix;
namespace qi = boost::spirit::qi;
namespace ascii = boost::spirit::ascii;

// One whole column: a vector of exactly one of the supported element types.
using ColumnType = boost::variant<
  std::vector<int>,
  std::vector<unsigned>,
  std::vector<double>
>;

// The parsed table, stored column by column.
using ResultType = std::vector<ColumnType>;

// Element types a column can be declared with.
enum class CSVDataType {
  Int,     // signed integer column
  UInt,    // unsigned integer column
  Double   // floating-point column
};

/// Spirit.Qi grammar for a blank-separated table whose first line names the
/// type of every column ("int", "unsigned" or "double").
///
/// The header line decides how each following cell is parsed. Values are
/// collected column by column in `result`; the same data is also exposed as
/// the grammar's synthesized attribute (ResultType).
///
/// The grammar keeps per-parse state in its members, so one instance must not
/// be used for several parses at the same time.
template<typename Iterator>
struct TypedCSVGrammar: qi::grammar<Iterator, ResultType(), ascii::blank_type> {
  /// Symbol table translating a header keyword into its CSVDataType.
  struct types_: qi::symbols<char, CSVDataType> {
    types_() {
      add
        ("int", CSVDataType::Int)
        ("unsigned", CSVDataType::UInt)
        ("double", CSVDataType::Double);
    }
  } types;

  TypedCSVGrammar() :
    TypedCSVGrammar::base_type(csv, "csv")
  {
    using namespace qi::labels;

    // Header line: any number of type keywords, collected into a vector.
    header %= *(types);

    // One cell of the column whose index is passed as inherited attribute _r1.
    // Each alternative is guarded by an eps predicate on the declared column
    // type, so only the numeric parser matching that type is attempted.
    cell =
      (
        qi::eps(phoenix::ref(column_types)[_r1] == phoenix::val(CSVDataType::Int))
        >> qi::int_ [phoenix::bind(&TypedCSVGrammar::add_int, this, _r1, _1)]
      ) | (
        qi::eps(phoenix::ref(column_types)[_r1] == phoenix::val(CSVDataType::UInt))
        >> qi::uint_ [phoenix::bind(&TypedCSVGrammar::add_uint, this, _r1, _1)]
      ) | (
        qi::eps(phoenix::ref(column_types)[_r1] == phoenix::val(CSVDataType::Double))
        >> qi::double_ [phoenix::bind(&TypedCSVGrammar::add_double, this, _r1, _1)]
      );

    // One data line: restart the column counter, then parse exactly as many
    // cells as the header declared, advancing the counter after each cell.
    row =
      qi::eps [phoenix::ref(column) = phoenix::val(0)]
      >> qi::repeat(phoenix::size(phoenix::ref(column_types))) [
        cell(phoenix::ref(column))
        >> qi::eps [phoenix::ref(column)++]
      ];

    // Whole file: header, newline, newline-separated rows, optional trailing
    // newlines. The final eps action copies the collected columns into the
    // synthesized attribute at parse time; this rule carries semantic actions
    // and is assigned with '=', so nothing is propagated automatically.
    csv =
      header [phoenix::bind(&TypedCSVGrammar::construct_columns, this, _1)]
      > qi::eol
      > row % qi::eol
      > *qi::eol
      > qi::eps [_val = phoenix::ref(result)];

    qi::on_error<qi::fail>
    (
        csv
      , std::cout
            << phoenix::val("Error! Expecting ")
            << _4                               // what failed?
            << phoenix::val(" here: \"")
            << phoenix::construct<std::string>(_3, _2)   // iterators to error-pos, end
            << phoenix::val("\"")
            << std::endl
    );
  }

  /// Appends `i` to column `c`; boost::get throws if that column does not
  /// hold std::vector<int>.
  void add_int(std::size_t c, int i) {
    boost::get<std::vector<int>>(result[c]).push_back(i);
  }
  /// Appends `i` to column `c`; boost::get throws if that column does not
  /// hold std::vector<unsigned>.
  void add_uint(std::size_t c, unsigned i) {
    boost::get<std::vector<unsigned>>(result[c]).push_back(i);
  }
  /// Appends `i` to column `c`; boost::get throws if that column does not
  /// hold std::vector<double>.
  void add_double(std::size_t c, double i) {
    boost::get<std::vector<double>>(result[c]).push_back(i);
  }

  /// Records the column types read from the header and creates one empty,
  /// correctly typed column per entry.
  void construct_columns(const std::vector<CSVDataType>& columns) {
    column_types = columns;
    // Drop columns left over from an earlier parse so that a reused grammar
    // instance starts from an empty table and the add_* indices stay valid.
    result.clear();
    result.reserve(columns.size());
    for (const auto& c : columns) {
      switch (c) {
      case CSVDataType::Int:
        result.push_back(std::vector<int>());
        break;

      case CSVDataType::UInt:
        result.push_back(std::vector<unsigned>());
        break;

      case CSVDataType::Double:
        result.push_back(std::vector<double>());
        break;
      }
    }
  }

  std::vector<CSVDataType> column_types;  // declared type of each column
  std::size_t column = 0;                 // index of the cell being parsed in the current row
  ResultType result;                      // collected values, one entry per column

  qi::rule<Iterator, ResultType(), ascii::blank_type> csv;
  qi::rule<Iterator, std::vector<CSVDataType>(), ascii::blank_type> header;
  qi::rule<Iterator, void(std::size_t), ascii::blank_type> cell;
  qi::rule<Iterator, void(), ascii::blank_type> row;
};

Is there any better solution? I want to use more than just 3 types (maybe more than 10 types). This would be a lot of typing.

Solution

I don't see why you would come up with such a contrived target data structure. It seems to invite errors with unmatched indices.

May I suggest a Nabialek Trick here.

It works well if you change the AST around to:

// A single cell: exactly one of the supported scalar types.
using ValueType = boost::variant<int, unsigned, double>;
// The parsed table: a vector of rows, each row a vector of cells.
using ResultType = std::vector<std::vector<ValueType>>;

(This seems like a more desirable approach anyways)

In short, you translate the column types into a vector of parser rules (std::vector<dynamic>).

Live On Coliru

#define BOOST_SPIRIT_DEBUG
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>

namespace px    = boost::phoenix;
namespace qi    = boost::spirit::qi;
namespace ascii = boost::spirit::ascii;

// A single cell: exactly one of the supported scalar types.
using ValueType = boost::variant<int, unsigned, double>;
// The parsed table: a vector of rows, each row a vector of cells.
using ResultType = std::vector<std::vector<ValueType>>;

// Element types a column can be declared with.
enum class CSVDataType {
    Int,
    UInt,
    Double
};

namespace boost { namespace spirit { namespace qi { // FOR DEBUG
    // Stream inserters needed only so that BOOST_SPIRIT_DEBUG can print
    // attributes and locals that hold rule pointers; they emit a fixed
    // placeholder instead of the pointer value.

    // A single pointer to a rule.
    template <typename... T>
    std::ostream& operator<<(std::ostream& out, rule<T...> const*) {
        out << "(lazy rule)";
        return out;
    }

    // A vector of rule pointers.
    template <typename... T>
    std::ostream& operator<<(std::ostream& out, std::vector<rule<T...> const*> const&) {
        out << "(column mappings)";
        return out;
    }
} } }

// Spirit.Qi grammar for a table whose first line names the type of every
// column. Each header keyword is looked up in `types`, which yields a pointer
// to the rule that parses a cell of that type; the data lines are then parsed
// by invoking those rules lazily, one per column. The synthesized attribute
// is ResultType (rows of ValueType cells).
template<typename Iterator, typename Skipper = ascii::blank_type>
struct TypedCSVGrammar: qi::grammar<Iterator, ResultType(), Skipper> {

    TypedCSVGrammar() : TypedCSVGrammar::base_type(start, "csv")
    {
        using namespace qi::labels;

        // Header line: any number of type keywords; the attribute is the
        // vector of cell-rule pointers they map to.
        header = *types;

        // Store the header's rule pointers in the local _cols (omit[] keeps
        // them out of the result), then parse newline-separated rows. For
        // each row _current is reset to 0 and exactly size(_cols) cells are
        // parsed; each cell dereferences the rule pointer at _current and
        // post-increments the index. Trailing newlines are consumed.
        // NOTE(review): the sample debug trace accompanying this code shows
        // an extra empty row at the end of the result when the input ends
        // with a newline - confirm whether callers need to strip it.
        csv    = qi::omit[ header [ _cols = _1 ] ] > qi::eol
               > qi::repeat(_current=0, px::size(_cols)) [ qi::lazy(*_cols[_current++]) ] % qi::eol
               > *qi::eol
               ;

        // `start` carries no locals, so it can serve as the grammar's entry
        // rule; it simply forwards to `csv`.
        start = csv;

        BOOST_SPIRIT_DEBUG_NODES((start)(csv)(header));

        // Report expectation failures inside `csv` on std::cout.
        qi::on_error<qi::fail> (csv, px::ref(std::cout)
                    << "Error! Expecting " << _4                                  // what failed?
                    << " here: \""         << px::construct<std::string>(_3, _2)  // iterators to error-pos, end
                    << "\"\n"
            );
    }

  private:
    // Rule parsing one cell into a ValueType, and the pointer-to-rule type
    // stored in the symbol table and in the per-parse column list.
    using cell_parser_t = qi::rule<Iterator, ValueType(), Skipper>;
    using dynamic       = cell_parser_t const*;

    // Symbol table mapping a header keyword to the address of the rule that
    // parses a cell of that type. The rules are members of the table itself,
    // so the stored pointers refer to objects owned by this table.
    struct types_: qi::symbols<char, dynamic> {
        cell_parser_t
            int_cell    = qi::int_,
            uint_cell   = qi::uint_,
            double_cell = qi::double_;

        types_() {
            this->add
                ("int",      &int_cell)
                ("unsigned", &uint_cell)
                ("double",   &double_cell);
            BOOST_SPIRIT_DEBUG_NODES((int_cell)(uint_cell)(double_cell))
        }
    } types;

    // Locals of `csv`: the per-column rule pointers and the index of the
    // column currently being parsed. _cols and _current are readable names
    // for the first (_a) and second (_b) local placeholder.
    using state = qi::locals<std::vector<dynamic>, size_t>;
    qi::_a_type _cols;
    qi::_b_type _current;

    qi::rule<Iterator, ResultType(),             Skipper> start;
    qi::rule<Iterator, std::vector<dynamic>(),   Skipper> header;
    qi::rule<Iterator, ResultType(),             Skipper, state>  csv;
};

int main() {
    using It = boost::spirit::istream_iterator;

    It f(std::cin >> std::noskipws), l;
    TypedCSVGrammar<It> g;
    ResultType data;
    bool ok = qi::phrase_parse(f, l, g, ascii::blank, data);
    if (ok) {
        std::cout << "Parse success\n";

        for(auto& row: data) {
            for(auto& cell: row) std::cout << cell << "\t";
            std::cout << "\n";
        }
    }
    else
        std::cout << "Parse failed\n";

    if (f!=l)
        std::cout << "Remaining unparsed: '" << std::string(f,l) << "'\n";
}

So for the input shown it prints

Parse success
12  1.3 23445   1   42  
45  46  47  48  49

And the debug information is:

<start>
  <try>int double double in</try>
  <csv>
    <try>int double double in</try>
    <header>
      <try>int double double in</try>
      <success>\n12  1.3    23445  1</success>
      <attributes>[[(lazy rule), (lazy rule), (lazy rule), (lazy rule), (lazy rule)]]</attributes>
    </header>
    <int_cell>
      <try>12  1.3    23445  1 </try>
      <success>  1.3    23445  1   </success>
      <attributes>[12]</attributes>
    </int_cell>
    <double_cell>
      <try>  1.3    23445  1   </try>
      <success>    23445  1   42\n45</success>
      <attributes>[1.3]</attributes>
    </double_cell>
    <double_cell>
      <try>    23445  1   42\n45</try>
      <success>  1   42\n45  46     </success>
      <attributes>[23445]</attributes>
    </double_cell>
    <int_cell>
      <try>  1   42\n45  46     </try>
      <success>   42\n45  46     47 </success>
      <attributes>[1]</attributes>
    </int_cell>
    <uint_cell>
      <try>   42\n45  46     47 </try>
      <success>\n45  46     47     4</success>
      <attributes>[42]</attributes>
    </uint_cell>
    <int_cell>
      <try>45  46     47     48</try>
      <success>  46     47     48  </success>
      <attributes>[45]</attributes>
    </int_cell>
    <double_cell>
      <try>  46     47     48  </try>
      <success>     47     48  49\n</success>
      <attributes>[46]</attributes>
    </double_cell>
    <double_cell>
      <try>     47     48  49\n</try>
      <success>     48  49\n</success>
      <attributes>[47]</attributes>
    </double_cell>
    <int_cell>
      <try>     48  49\n</try>
      <success>  49\n</success>
      <attributes>[48]</attributes>
    </int_cell>
    <uint_cell>
      <try>  49\n</try>
      <success>\n</success>
      <attributes>[49]</attributes>
    </uint_cell>
    <int_cell>
      <try></try>
      <fail/>
    </int_cell>
    <success></success>
    <attributes>[[[12, 1.3, 23445, 1, 42], [45, 46, 47, 48, 49], []]]</attributes><locals>((column mappings) 1)</locals>
  </csv>
  <success></success>
  <attributes>[[[12, 1.3, 23445, 1, 42], [45, 46, 47, 48, 49], []]]</attributes>
</start>