I was recently trying to implement a simple tokenizer using Boost Spirit X3. The challenge I'm struggling with right now is retrieving the position of each token in the input stream.
There is a good tutorial about annotation on the official website: https://www.boost.org/doc/libs/develop/libs/spirit/doc/x3/html/spirit_x3/tutorials/annotation.html. However, it has some limitations: it basically parses a list of identical (homogeneous) entities, which is often not the case in real life.
So I was trying to create a tokenizer with 2 kinds of entities: whitespace (a sequence of spaces) and a single-line comment (starts with `//`, continues till the end of the line).
See the minimal example code at the end of the question.
However, I'm getting errors while trying to retrieve the position of any of the tokens. After some debugging I discovered that the `annotate_position::on_success` handler deduces the `T` type to be `boost::spirit::x3::unused_type`, but I don't know why.
So, I have several questions:
- […] `SingleLineComment` and `Whitespace` classes with no success. I suspect that this is because the comment and whitespace strings are omitted in the parser; is there a way to get around this?
- […] `grammar` class or `spirit::lex`; however, there are no such things in the X3 version)

Here is a minimal example code piece:
#include <string>
#include <iostream>
#include <functional>
#include <vector>
#include <optional>
#include <variant>
#include <boost/spirit/home/x3.hpp>
#include <boost/spirit/home/x3/support/ast/position_tagged.hpp>
// NOTE(review): a file-scope using-directive pulls every std name into the
// global namespace; main() below relies on it for cout/boolalpha/endl.
using namespace std;
// Short alias used throughout for the Spirit X3 namespace.
namespace x3 = boost::spirit::x3;
// Incomplete tag type: serves only as the compile-time key under which the
// position cache is injected with x3::with<> and looked up with x3::get<>.
struct position_cache_tag;
// copy paste from boost documentation example
struct annotate_position
{
template <typename T, typename Iterator, typename Context>
inline void on_success(Iterator const &first, Iterator const &last, T &ast, Context const &context)
{
auto &position_cache = x3::get<position_cache_tag>(context).get();
position_cache.annotate(ast, first, last);
}
};
// AST node for a `//` comment. It deliberately stores no text: the idea is
// that the position tag alone is enough to look up the matching iterator
// range afterwards (whether that holds is part of the question).
struct SingleLineComment : x3::position_tagged
{
};
// AST node for a run of whitespace. Like SingleLineComment it carries no
// payload and relies on its position tag alone.
struct Whitespace : x3::position_tagged
{
};
// Further token kinds (e.g. MultilineComment, integer, identifier) could be
// added alongside the two above.
// Position-tagged wrapper that unifies SingleLineComment and Whitespace
// under a single token type.
struct Token : x3::position_tagged
{
    enum class Type
    {
        SingleLineComment,
        Whitespace
    };

    // Expected to be filled in by a semantic action; std::optional acts as a
    // safeguard that makes an unset type detectable.
    std::optional<Type> type;

    // The concrete node, wrapped in std::optional for the same reason. This
    // field may be needed for more complex tokens that hold additional data.
    std::optional<std::variant<SingleLineComment, Whitespace>> data;
};
// Each rule gets its own distinct tag type; all of them inherit the
// on_success handler from annotate_position.
struct SingleLineCommentHook : annotate_position {};
struct WhitespaceHook : annotate_position {};
struct TokenHook : annotate_position {};
// Rule declarations: each pairs a hook (tag) type with the attribute type
// the rule produces; the string literal is presumably the rule's
// diagnostic name.
const x3::rule<SingleLineCommentHook, SingleLineComment> singleLineComment = "single line comment";
const x3::rule<WhitespaceHook, Whitespace> whitespace = "whitespace";
const x3::rule<TokenHook, Token> token = "token";
// Rule definitions. Both wrap the character-consuming part in x3::omit, so
// the matched text itself is not stored in the attribute.
// "//" followed by every character up to, but not including, '\n'.
const auto singleLineComment_def = x3::lit("//") >> x3::omit[*(x3::char_ - '\n')];
// One or more x3::ascii::space characters.
const auto whitespace_def = x3::omit[+x3::ascii::space];
// Ties the two rules above to their *_def parsers. `token` is declared here
// but only defined further down, after its semantic actions.
BOOST_SPIRIT_DEFINE(singleLineComment, whitespace);
// Semantic actions for the two alternatives of `token`. Each one writes the
// token kind and the just-parsed node into the value returned by
// x3::_val(context) (the Token being built, judging by the fields assigned);
// x3::_attr(context) supplies the node parsed by the sub-rule.
// NOTE(review): identifiers beginning with an underscore are reserved in the
// global namespace; consider renaming these two lambdas.
auto _setSingleLineComment = [](const auto &context) {
x3::_val(context).type = Token::Type::SingleLineComment;
x3::_val(context).data = x3::_attr(context);
};
auto _setWhitespace = [](const auto &context) {
x3::_val(context).type = Token::Type::Whitespace;
x3::_val(context).data = x3::_attr(context);
};
// A token is either a single-line comment or a whitespace run; the matching
// semantic action fills in the Token's fields.
const auto token_def = (singleLineComment[_setSingleLineComment] | whitespace[_setWhitespace]);
BOOST_SPIRIT_DEFINE(token);
// Tokenizes a two-line sample, prints the kind of every token and then
// tries to look up the source position of the first token (the step the
// question is about).
int main()
{
// copy paste from boost documentation example
using iterator_type = std::string::const_iterator;
using position_cache = boost::spirit::x3::position_cache<std::vector<iterator_type>>;
std::string content = R"(// first single line comment
// second single line comment
)";
// expect 4 tokens: comment -> whitespace -> comment -> whitespace
position_cache positions{content.cbegin(), content.cend()};
std::vector<Token> tokens;
// The cache is injected by reference (std::ref) under position_cache_tag so
// that annotate_position::on_success can reach it through the context.
const auto parser = x3::with<position_cache_tag>(std::ref(positions))[*token];
auto start = content.cbegin();
// x3::eps(false) is passed as the skipper -- presumably so that nothing is
// skipped and whitespace reaches the token rule itself.
auto success = x3::phrase_parse(start, content.cend(), parser, x3::eps(false), tokens);
// Only count the parse as successful if the whole input was consumed.
success &= (start == content.cend());
cout << boolalpha << success << endl;
cout << "Found " << tokens.size() << " tokens" << endl;
// NOTE(review): type.value() throws std::bad_optional_access if no semantic
// action assigned the type.
for (auto &token : tokens)
cout << (token.type.value() == Token::Type::SingleLineComment ? "comment" : "space") << endl;
// all good till this point
// now I want to get a position
// the following throws
auto pos = positions.position_of(tokens.front());
}
Thanks for reading, looking forward to any replies!
The `on_success` handler does not seem to be invoked when semantic actions are involved.
In fact you're redundantly tagging the Ast nodes and the variant.
You could already get the correct result for the first token with e.g.
// `data` is a std::optional wrapping the variant, so it has to be
// dereferenced before std::get can select the SingleLineComment alternative.
auto pos = positions.position_of(
    std::get<SingleLineComment>(*tokens.front().data));
That's obviously not very convenient due to the static type switching required.
Here's a much simplified version:
#include <iostream>
#include <iomanip>
#include <variant>
#include <boost/spirit/home/x3.hpp>
#include <boost/spirit/home/x3/support/ast/position_tagged.hpp>
namespace x3 = boost::spirit::x3;
// Empty marker types, one per kind of token; they carry no data.
struct SingleLineComment{};
struct Whitespace {};
// The closed set of alternatives a token can hold.
using Variant = std::variant<SingleLineComment, Whitespace>;
// A token IS the variant plus a position tag: only the token as a whole is
// position-tagged, not the individual alternatives.
struct Token : Variant, x3::position_tagged {
// Inherit the variant's constructors so that a Token can be constructed
// directly from either alternative.
using Variant::Variant;
};
// Unnamed namespace: everything below has internal linkage.
namespace {
// Incomplete tag type used as the key for the position cache in the X3
// context (see x3::with<> in main and x3::get<> below).
struct position_cache_tag;
namespace Parser {
// Success-handler mix-in: hands the attribute and the matched iterator range
// to the position cache found in the context.
struct annotate_position {
template <typename T, typename Iterator, typename Context>
inline void on_success(Iterator first, Iterator last, T &ast, Context const &context) const {
// No .get() needed here: main passes the cache to x3::with directly rather
// than through std::ref.
auto &position_cache = x3::get<position_cache_tag>(context);
position_cache.annotate(ast, first, last);
}
};
// One hook (rule tag) type per attribute type. Only Hook<Token> derives from
// annotate_position, so only Token rules get their position recorded.
template <typename> struct Hook {}; // no annotate_position mix-in
template <> struct Hook<Token> : annotate_position {};
// as<T>(p, name): wraps parser `p` in a rule with tag Hook<T> and attribute
// type T. The rule name defaults to the typeid name of the parser's type.
// NOTE(review): typeid requires <typeinfo>, which is not among the includes
// shown for this program.
template <typename T>
static auto constexpr as = [](auto p, char const* name = typeid(decltype(p)).name()) {
return x3::rule<Hook<T>, T> {name} = p;
};
// rule definitions
// "//" followed by everything up to (not including) the end of line; the
// text is omitted from the attribute.
auto singleLineComment = as<SingleLineComment>("//" >> x3::omit[*(x3::char_ - x3::eol)]);
// One or more x3::ascii::space characters, likewise omitted.
auto whitespace = as<Whitespace> (x3::omit[+x3::ascii::space]);
// A token is either alternative; no semantic actions are involved.
auto token = as<Token> (singleLineComment | whitespace, "token");
}
}
// Tokenizes a two-line sample and prints every token's kind together with
// the exact source text recovered from the position cache.
// Returns non-zero when the input cannot be tokenized completely.
int main() {
    using It = std::string::const_iterator;
    using position_cache = x3::position_cache<std::vector<It>>;

    std::string const content = R"(// first single line comment
// second single line comment
)";
    position_cache positions{content.begin(), content.end()};
    auto parser = x3::with<position_cache_tag>(positions)[*Parser::token];

    std::vector<Token> tokens;
    // x3::eoi makes the parse fail unless the whole input was consumed.
    if (!parse(begin(content), end(content), parser >> x3::eoi, tokens)) {
        // Report the failure instead of silently exiting with status 0.
        std::cerr << "Parse failed\n";
        return 1;
    }

    // '\n' instead of std::endl: same output without a flush per line.
    std::cout << "Found " << tokens.size() << " tokens\n";
    for (auto const& token : tokens) {
        auto const pos = positions.position_of(token);
        std::cout
            // Variant index 0 is SingleLineComment, index 1 is Whitespace.
            << (token.index() ? "space" : "comment") << "\t"
            // The range points into `content`, whose storage is contiguous,
            // so a string_view over it is valid while `content` is alive.
            << std::quoted(std::string_view(&*pos.begin(), pos.size()))
            << '\n';
    }
}
Prints
Found 4 tokens
comment "// first single line comment"
space "
"
comment "// second single line comment"
space "
"