Search code examples
c++boostc++17boost-spiritboost-tokenizer

Tokenize a "Braced Initializer List"-Style String in C++ (With Boost?)


I have a string (nested strings even) that are formatted like a C++ braced initializer list. I want to tokenize them one level at a time into a vector of strings.

So when I input "{one, two, three}" to the function should output a three element vector

"one",

"two",

"three"

To complicate this, it needs to support quoted tokens and preserve nested lists:

Input String: "{one, {2, \"three four\"}}, \"five, six\", {\"seven, eight\"}}"

Output is a four element vector:

"one",

"{2, \"three four\"}",

"five, six",

"{\"seven, eight\"}"

I've looked at a few other SO posts:

Using Boost Tokenizer escaped_list_separator with different parameters

Boost split not traversing inside of parenthesis or braces

And used those to start a solution, but this seems slightly too complicated for the tokenizer (because of the braces):

#include <boost/algorithm/string.hpp>
#include <boost/tokenizer.hpp>

std::vector<std::string> TokenizeBracedList(const std::string& x)
{
  std::vector<std::string> tokens;

  std::string separator1("");
  std::string separator2(",\n\t\r");
  std::string separator3("\"\'");

  boost::escaped_list_separator<char> elements(separator1, separator2, separator3);
  boost::tokenizer<boost::escaped_list_separator<char>> tokenizer(x, elements);

  for(auto i = std::begin(tokenizer); i != std::end(tokenizer); ++i)
  {
    auto token = *i;
    boost::algorithm::trim(token);
    tokens.push_back(token);
  }

  return tokens;
}

With this, even in the trivial case, it doesn't strip the opening and closing braces.

Boost and C++17 are fair game for a solution.


Solution

  • Simple (Flat) Take

    Defining a flat data structure like:

    using token  = std::string;
    using tokens = std::vector<token>;
    

    We can define an X3 parser like:

    namespace Parser {
        using namespace boost::spirit::x3;
    
        rule<struct list_, token> item;
    
        auto quoted   = lexeme [ '"' >> *('\\' >> char_ | ~char_('"')) >> '"' ];
        auto bare     = lexeme [ +(graph-','-'}') ];
    
        auto list     = '{' >> (item % ',') >> '}';
        auto sublist  = raw [ list ];
    
        auto item_def = sublist | quoted | bare;
    
        BOOST_SPIRIT_DEFINE(item)
    }
    

    Live On Wandbox

    #include <boost/spirit/home/x3.hpp>
    #include <iostream>
    #include <iomanip>
    
    using token  = std::string;
    using tokens = std::vector<token>;
    
    namespace x3 = boost::spirit::x3;
    
    namespace Parser {
        using namespace boost::spirit::x3;
    
        rule<struct list_, token> item;
    
        auto quoted   = lexeme [ '"' >> *('\\' >> char_ | ~char_('"')) >> '"' ];
        auto bare     = lexeme [ +(graph-','-'}') ];
    
        auto list     = '{' >> (item % ',') >> '}';
        auto sublist  = raw [ list ];
    
        auto item_def = sublist | quoted | bare;
    
        BOOST_SPIRIT_DEFINE(item)
    }
    
    int main() {
        for (std::string const input : {
                R"({one, "five, six"})",
                R"({one, {2, "three four"}, "five, six", {"seven, eight"}})",
            })
        {
            auto f = input.begin(), l = input.end();
    
            std::vector<std::string> parsed;
            bool ok = phrase_parse(f, l, Parser::list, x3::space, parsed);
    
            if (ok) {
                std::cout << "Parsed: " << parsed.size() << " elements\n";
                for (auto& el : parsed) {
                    std::cout << " - " << std::quoted(el, '\'') << "\n";
                }
            } else {
                std::cout << "Parse failed\n";
            }
    
            if (f != l)
                std::cout << "Remaining unparsed: " << std::quoted(std::string{f, l}) << "\n";
        }
    }
    

    Prints

    Parsed: 2 elements
     - 'one'
     - 'five, six'
    Parsed: 4 elements
     - 'one'
     - '{2, "three four"}'
     - 'five, six'
     - '{"seven, eight"}'
    

    Nested Data

    Changing the datastructure to be a bit more specific/realistic:

    namespace ast {
        using value = boost::make_recursive_variant<
                double,
                std::string,
                std::vector<boost::recursive_variant_>
            >::type;
        using list = std::vector<value>;
    }
    

    Now we can change the grammar, as we no longer need to treat sublist as if it is a string:

    namespace Parser {
        using namespace boost::spirit::x3;
    
        rule<struct item_, ast::value> item;
    
        auto quoted   = lexeme [ '"' >> *('\\' >> char_ | ~char_('"')) >> '"' ];
        auto bare     = lexeme [ +(graph-','-'}') ];
    
        auto list     = x3::rule<struct list_, ast::list> {"list" }
                      = '{' >> (item % ',') >> '}';
    
        auto item_def = list | double_ | quoted | bare;
    
        BOOST_SPIRIT_DEFINE(item)
    }
    

    Everything "still works": Live On Wandbox

    #include <boost/spirit/home/x3.hpp>
    #include <iostream>
    #include <iomanip>
    
    namespace ast {
        using value = boost::make_recursive_variant<
                double,
                std::string,
                std::vector<boost::recursive_variant_>
            >::type;
        using list = std::vector<value>;
    }
    
    namespace x3 = boost::spirit::x3;
    
    namespace Parser {
        using namespace boost::spirit::x3;
    
        rule<struct item_, ast::value> item;
    
        auto quoted   = lexeme [ '"' >> *('\\' >> char_ | ~char_('"')) >> '"' ];
        auto bare     = lexeme [ +(graph-','-'}') ];
    
        auto list     = x3::rule<struct list_, ast::list> {"list" }
                      = '{' >> (item % ',') >> '}';
    
        auto item_def = list | double_ | quoted | bare;
    
        BOOST_SPIRIT_DEFINE(item)
    }
    
    struct pretty_printer {
        using result_type = void;
        std::ostream& _os;
        int _indent;
    
        pretty_printer(std::ostream& os, int indent = 0) : _os(os), _indent(indent) {}
    
        void operator()(ast::value const& v) { boost::apply_visitor(*this, v); }
    
        void operator()(double v)            { _os << v; }
        void operator()(std::string s)       { _os << std::quoted(s); }
        void operator()(ast::list const& l)  {
            _os << "{\n";
            _indent += 2;
            for (auto& item : l) {
                _os << std::setw(_indent) << "";
                operator()(item);
               _os << ",\n";
            }
            _indent -= 2;
            _os << std::setw(_indent) << "" << "}";
        }
    };
    
    int main() {
        pretty_printer print{std::cout};
    
        for (std::string const input : {
                R"({one, "five, six"})",
                R"({one, {2, "three four"}, "five, six", {"seven, eight"}})",
            })
        {
            auto f = input.begin(), l = input.end();
    
            ast::value parsed;
            bool ok = phrase_parse(f, l, Parser::item, x3::space, parsed);
    
            if (ok) {
                std::cout << "Parsed: ";
                print(parsed);
                std::cout << "\n";
            } else {
                std::cout << "Parse failed\n";
            }
    
            if (f != l)
                std::cout << "Remaining unparsed: " << std::quoted(std::string{f, l}) << "\n";
        }
    }
    

    Prints:

    Parsed: {
      "one",
      "five, six",
    }
    Parsed: {
      "one",
      {
        2,
        "three four",
      },
      "five, six",
      {
        "seven, eight",
      },
    }