Search code examples
c++parsingstliteratorboost-spirit

parsing fails when using istream iterator


I'm using boost::spirit to parse csv input (please don't suggest an alternative, this is just testing). When I read the contents of stdin to a string and iterate over that, the parsing succeeds; however, when the contents of std::cin are read directly (through a wrapper that I wrote myself because phrase_parse requires an iterator inheriting from std::iterator<std::forward_iterator_tag, T>, and std::istream_iterator<T> doesn't do that), the parsing fails, and I can't figure out why, since the debugging output seems to suggest that the same text is parsed in both situations, with different results.

I even tried iterating over std::cin and putting that into a string, and that parsed correctly; I don't understand why the type of iterator provided is affecting the result. Here's the example I'm working off of (sorry it's so large, but you can plug it in and compile it easily). Try defining the macros SECTION_STRINGSTREAM (succeeds) or SECTION_CIN (fails) to observe the strange behavior (the default behavior (succeeds) is when std::cin is read to a string).

If you compile and run this with echo "\"f\",111,222,333,\"ref_type\",\"spc\",\"type\",\"lan\",\"name\",\"scop\"" | ./spirit_csv, the debug output clearly shows the entire string is being parsed. I also added if (++start == end) std::cerr << "woah"; and that is being tripped in all situations, so it seems that it definitely is parsing to the end of the input.

// following example from:
// http://www.boost.org/doc/libs/1_58_0/libs/spirit/example/qi/employee.cpp, and
// num_list4.cpp, and others

#define BOOST_SPIRIT_DEBUG 1
#define BOOST_SPIRIT_DEBUG_PRINT_SOME 200
#define BOOST_SPIRIT_DEBUG_OUT std::cerr

// std includes
#include <iostream>
#include <string>
// boost includes
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix_stl.hpp>

namespace frontend {
namespace spirit = boost::spirit;
namespace qi = spirit::qi;
namespace ascii = spirit::ascii;

struct cursor {
  std::string file;
  unsigned long long offset;
  unsigned long long line;
  unsigned long long col;
  // verify inputs using enum
  // decl/ref/defn/call
  std::string reference_type;
  // variable/function/scope/label/type
  std::string specifier;
  // if variable/function, then type
  std::string type;
  std::string language;
  std::string name;
  std::string scope;
};
}

// adapt struct to boost fusion
BOOST_FUSION_ADAPT_STRUCT(frontend::cursor, (std::string, file),
                          (unsigned long long, offset),
                          (unsigned long long, line), (unsigned long long, col),
                          (std::string, reference_type),
                          (std::string, specifier), (std::string, type),
                          (std::string, language), (std::string, name),
                          (std::string, scope));

// note: blank_type is so that newlines aren't counted as skippable, because
// they are significant for csv! however, typically you'll be wanting to use
// boost::spirit::ascii::space as your whitespace operator if you really do not
// care about whitespace
namespace frontend {
template <typename Iterator>
struct cursor_parser
    : public qi::grammar<Iterator, std::vector<cursor>(), qi::blank_type> {
  qi::rule<Iterator, std::string(), qi::blank_type> quoted_string;
  qi::rule<Iterator, cursor(), qi::blank_type> start;
  qi::rule<Iterator, std::vector<cursor>(), qi::blank_type> vec;

  cursor_parser() : cursor_parser::base_type(vec) {
    using qi::uint_;
    using qi::eol;
    using qi::lexeme;
    using qi::_1;
    using ascii::char_;
    using boost::phoenix::push_back;
    using boost::phoenix::ref;
    using boost::spirit::_val;

    quoted_string %= lexeme['"' >> *(char_ - '"') >> '"'];

    start %=
        // file
        quoted_string >> ',' >>
        // offset
        uint_ >> ',' >>
        // line
        uint_ >> ',' >>
        // col
        uint_ >> ',' >>
        // reference_type
        quoted_string >> ',' >>
        // specifier
        quoted_string >> ',' >>
        // type
        quoted_string >> ',' >>
        // language
        quoted_string >> ',' >>
        // name
        quoted_string >> ',' >>
        // scope
        quoted_string;

    vec %= start % eol;

    quoted_string.name("qs");
    debug(quoted_string);
    start.name("s");
    debug(start);
    vec.name("v");
    debug(vec);
  }
};

template <typename T>
class cin_forward_iterator : std::iterator<std::forward_iterator_tag, T> {
private:
  std::istream_iterator<T> i;

public:
  cin_forward_iterator() : i(std::istream_iterator<T>()) {}
  cin_forward_iterator(std::istream &in) : i(std::istream_iterator<T>(in)) {}
  const T &operator*() const { return *i; }
  cin_forward_iterator<T> operator++() {
    ++i;
    return *this;
  };
  cin_forward_iterator<T> operator++(int) {
    cin_forward_iterator<T> tmp = *this;
    i++;
    return tmp;
  };
  bool operator==(const cin_forward_iterator<T> &rhs) const {
    return i == rhs.i;
  }
  bool operator!=(const cin_forward_iterator<T> &rhs) const {
    return not(*this == rhs);
  }
};
}

namespace std {
template <typename T> class iterator_traits<frontend::cin_forward_iterator<T>> {
public:
  typedef typename std::istream_iterator<T>::value_type value_type;
  typedef typename std::istream_iterator<T>::difference_type difference_type;
  typedef typename std::istream_iterator<T>::reference reference;
  typedef typename std::istream_iterator<T>::pointer pointer;
  typedef std::forward_iterator_tag iterator_category;
};
}

/* try:
echo \
  "\"f\",111,222,333,\"ref_type\",\"spc\",\"type\",\"lan\",\"name\",\"scop\"" \
  | ./spirit_csv
*/
int main() {
  std::vector<frontend::cursor> v;
// succeeds
#ifdef SECTION_STRINGSTREAM
  std::stringstream ss;
  ss << std::cin.rdbuf();
  std::string s(ss.str());
  auto start = s.cbegin();
  auto end = s.cend();
// fails
#elif SECTION_CIN
  noskipws(std::cin);
  frontend::cin_forward_iterator<char> start(std::cin);
  frontend::cin_forward_iterator<char> end;
// succeeds
#else
  noskipws(std::cin);
  frontend::cin_forward_iterator<char> start_in(std::cin);
  frontend::cin_forward_iterator<char> end_in;
  std::string s;
  for (; start_in != end_in; ++start_in) {
    s += *start_in;
  }
  auto start = s.begin();
  auto end = s.end();
#endif
  if (phrase_parse(start, end,
#ifdef SECTION_STRINGSTREAM
                   frontend::cursor_parser<std::string::const_iterator>(),
#elif SECTION_CIN
                   frontend::cursor_parser<
                       frontend::cin_forward_iterator<char>>(),
#else
                   frontend::cursor_parser<std::string::iterator>(),
#endif
                   boost::spirit::qi::blank, v)) {
    for (auto &c : v) {
      std::cout << boost::fusion::as_vector(c) << std::endl;
    }
    std::cerr << "success!" << std::endl;
    return 0;
  } else {
    std::cerr << "failure!" << std::endl;
    return 1;
  }
}

Solution

  • Why do you have your own iterator?

    It's hard to get right, and it certainly doesn't look like you made it multi-pass aware.

    There is a reason why input iterators have a different category than forward iterators! Just papering over it doesn't help. Forward iterators MUST be copyable and have repeatable values on dereference. Input iterators don't satisfy those criteria.

    In fact you should either just use boost::spirit::istream_iterator or you could compose an iterator using Spirit's multi_pass adaptor:

    Here's a fixed and cleaned-up version:

    Live On Coliru

    #define BOOST_SPIRIT_DEBUG 1
    #define BOOST_SPIRIT_DEBUG_PRINT_SOME 200
    #define BOOST_SPIRIT_DEBUG_OUT std::cerr
    
    // std includes
    #include <iostream>
    #include <string>
    // boost includes
    #include <boost/spirit/include/qi.hpp>
    #include <boost/fusion/adapted.hpp>
    #include <boost/fusion/include/as_vector.hpp>
    
    namespace frontend {
        namespace qi     = boost::spirit::qi;
    
        struct cursor {
            std::string file;
            unsigned long long offset;
            unsigned long long line;
            unsigned long long col;
            // verify inputs using enum
            // decl/ref/defn/call
            std::string reference_type;
            // variable/function/scope/label/type
            std::string specifier;
            // if variable/function, then type
            std::string type;
            std::string language;
            std::string name;
            std::string scope;
        };
    }
    
    // adapt struct to boost fusion
    BOOST_FUSION_ADAPT_STRUCT(frontend::cursor, 
            (std::string, file)
            (unsigned long long, offset)
            (unsigned long long, line)
            (unsigned long long, col)
            (std::string, reference_type)
            (std::string, specifier)
            (std::string, type)
            (std::string, language)
            (std::string, name)
            (std::string, scope))
    
    namespace frontend {
    
        // NOTE: blank_type doesn't skip newlines
        template <typename Iterator>
        struct cursor_parser : public qi::grammar<Iterator, std::vector<cursor>(), qi::blank_type> {
    
            cursor_parser() : cursor_parser::base_type(vec) {
                using qi::uint_;
                using qi::eol;
                using qi::lexeme;
                using qi::char_;
    
                quoted_string %= lexeme['"' >> *(char_ - '"') >> '"'];
    
                start %=
                    quoted_string  >> ','   >>  // file
                    uint_          >> ','   >>  // offset
                    uint_          >> ','   >>  // line
                    uint_          >> ','   >>  // col
                    quoted_string  >> ','   >>  // reference_type
                    quoted_string  >> ','   >>  // specifier
                    quoted_string  >> ','   >>  // type
                    quoted_string  >> ','   >>  // language
                    quoted_string  >> ','   >>  // name
                    quoted_string; // scope
    
                vec %= start % eol;
    
                BOOST_SPIRIT_DEBUG_NODES((quoted_string)(start)(vec))
            }
    
            private:
            qi::rule<Iterator, std::string()        , qi::blank_type> quoted_string;
            qi::rule<Iterator, cursor()             , qi::blank_type> start;
            qi::rule<Iterator, std::vector<cursor>(), qi::blank_type> vec;
        };
    }
    
    int main() {
        // '"f",111,222,333,"ref_type","spc","type","lan","name","scop"'
        using It = boost::spirit::istream_iterator;
    
        It start_in(std::cin >> std::noskipws), end_in;
        std::vector<frontend::cursor> v;
    
        if (phrase_parse(start_in, end_in, frontend::cursor_parser<It>(), frontend::qi::blank, v)) {
            for (auto &c : v) {
                std::cout << boost::fusion::as_vector(c) << std::endl;
            }
            std::cerr << "success!" << std::endl;
        } else {
            std::cerr << "failure!" << std::endl;
            return 1;
        }
    }
    

    Output

    (f 111 222 333 ref_type spc type lan name scop)
    success!
    

    Debug output:

    <vec>
      <try>"f",111,222,333,"ref_type","spc","type","lan","name","scop"\n</try>
      <start>
        <try>"f",111,222,333,"ref_type","spc","type","lan","name","scop"\n</try>
        <quoted_string>
          <try>"f",111,222,333,"ref_type","spc","type","lan","name","scop"\n</try>
          <success>,111,222,333,"ref_type","spc","type","lan","name","scop"\n</success>
          <attributes>[[f]]</attributes>
        </quoted_string>
        <quoted_string>
          <try>"ref_type","spc","type","lan","name","scop"\n</try>
          <success>,"spc","type","lan","name","scop"\n</success>
          <attributes>[[r, e, f, _, t, y, p, e]]</attributes>
        </quoted_string>
        <quoted_string>
          <try>"spc","type","lan","name","scop"\n</try>
          <success>,"type","lan","name","scop"\n</success>
          <attributes>[[s, p, c]]</attributes>
        </quoted_string>
        <quoted_string>
          <try>"type","lan","name","scop"\n</try>
          <success>,"lan","name","scop"\n</success>
          <attributes>[[t, y, p, e]]</attributes>
        </quoted_string>
        <quoted_string>
          <try>"lan","name","scop"\n</try>
          <success>,"name","scop"\n</success>
          <attributes>[[l, a, n]]</attributes>
        </quoted_string>
        <quoted_string>
          <try>"name","scop"\n</try>
          <success>,"scop"\n</success>
          <attributes>[[n, a, m, e]]</attributes>
        </quoted_string>
        <quoted_string>
          <try>"scop"\n</try>
          <success>\n</success>
          <attributes>[[s, c, o, p]]</attributes>
        </quoted_string>
        <success>\n</success>
        <attributes>[[[f], 111, 222, 333, [r, e, f, _, t, y, p, e], [s, p, c], [t, y, p, e], [l, a, n], [n, a, m, e], [s, c, o, p]]]</attributes>
      </start>
      <start>
        <try></try>
        <quoted_string>
          <try></try>
          <fail/>
        </quoted_string>
        <fail/>
      </start>
      <success>\n</success>
      <attributes>[[[[f], 111, 222, 333, [r, e, f, _, t, y, p, e], [s, p, c], [t, y, p, e], [l, a, n], [n, a, m, e], [s, c, o, p]]]]</attributes>
    </vec>
    

    Notes:

    • you had errors in the BOOST_FUSION_ADAPT_STRUCT macro invocation (too many commas)