I'm learning Boost Spirit by writing a parser that parses two variants of hex number used by NAMS:
0x
/0h
or prefix of h
/x
.$
and must be followed by a decimal digit.Here is what I have come up so far and with Coliru Session:
//#define BOOST_SPIRIT_X3_DEBUG
#include <iostream>
#include <boost/spirit/home/x3.hpp>
#include <boost/spirit/home/x3/support/ast/variant.hpp>
#include <boost/spirit/include/support_extended_variant.hpp>
namespace x3 = boost::spirit::x3;
namespace ast {
struct hex_data : std::string {};
struct pascal_hex_data : std::string {};
struct declared_data : boost::spirit::extended_variant<hex_data, pascal_hex_data>
{
declared_data () : base_type () { std::cout << "ctor default\n"; }
declared_data (hex_data const& rhs) : base_type (rhs) { std::cout << "ctor hex: " << rhs << "\n"; }
declared_data (pascal_hex_data const& rhs) : base_type (rhs) { std::cout << "ctor pascal: " << rhs << "\n"; }
};
} // namespace ast
typedef x3::rule<struct hex_digits_class, std::string> hex_digit_type;
typedef x3::rule<struct hex_data_class, ast::hex_data> hex_data_type;
typedef x3::rule<struct pascalhex_data_class, ast::pascal_hex_data> pascalhex_data_type;
typedef x3::rule<struct declared_data_class, ast::declared_data> declared_data_type;
const hex_data_type hex_data = "hex_data";
const hex_digit_type hex_digit = "hex_digit";
const pascalhex_data_type pascalhex_data = "pascal_hex_data";
const declared_data_type declared_data = "declared_data";
auto const hex_digit_def =
= x3::skip(x3::char_('_'))
[
x3::no_case
[
x3::char_ ('0', '9') | x3::char_ ("a", "f")
]
]
;
auto const hex_data_def
= x3::no_case[x3::lit ("0h") | "0x"] >> +hex_digit_def
| +hex_digit_def >> x3::no_case[x3::lit ("h") | "x"]
;
auto const pascalhex_data_def
= x3::lit ("$") >> x3::char_ ('0', '9') >> +hex_digit_def;
auto const declared_data_def
= hex_data_def
| pascalhex_data_def
;
BOOST_SPIRIT_DEFINE (hex_digit, hex_data, pascalhex_data, declared_data)
struct Visitor
{
using result_type = std::string;
std::string operator()(ast::hex_data const & v) const { return "hex_data"; }
std::string operator()(ast::pascal_hex_data const & v) const { return "pascal_hex_data"; }
};
int main()
{
std::string input = "$9";
ast::declared_data parsed;
bool r =
x3::parse (input.begin (), input.end (),
declared_data_def,
parsed);
std::cout << "r = " << r << "\n";
Visitor v;
std::cout << "result = " << boost::apply_visitor(v, parsed) << "\n";
}
However, the rule pascalhex_data_def
fails to compile with error message that looks like spirit is deducing the attribute of the rule to be a fusion tuple of char
and vector
of variant
even though the rule is specified to have attribute of an ast derived from string
:
typedef x3::rule<struct pascalhex_data_class, ast::pascal_hex_data> pascalhex_data_type;
Can anyone point out why the attribute deduced by boost is not what's specified? Anyway to force the rule to generate string rather than the tuple
boost is trying to return?
Your code seems extremely complicated for what it achieves. However, after looking at it for considerable time, I noticed you are declaring rules (which coerce their attribute types), but not using them at the crucial time:
auto const declared_data_def = hex_data_def | pascalhex_data_def;
This means you directly build an expression tree from the expression template (_def
) initializers, instead of the rules:
auto const declared_data_def = hex_data | pascalhex_data;
That compiles. It still leaves quite some issues:
you can/should do without the variant constructors:
struct declared_data : boost::spirit::extended_variant<hex_data, pascal_hex_data> {
using extended_variant::extended_variant;
};
You can write x3::char_ ('0', '9')
as x3::char_("0-9")
, so you can write
x3::no_case
[
x3::char_ ('0', '9') | x3::char_ ("a", "f")
]
instead as
x3::no_case [ x3::char_ ("0-9a-f") ]
or even
x3::char_ ("0-9a-fA-F")
or, maybe just:
x3::xdigit
hex_digits_type
declares a std::string
attribute, but parses only a a single character. Instead of using +hex_digits_def
, just use hex_digits
and write:
auto const hex_digits_def = x3::skip(x3::char_('_')) [ +x3::xdigit ];
your definition
"$" >> x3::char_("0-9") >> hex_digits
consumes the first digit of the hex number. That's leading to error (parsing the empty string for e.g. $9
). Instead you probably want to check with operator&
:
'$' >> &x3::char_("0-9") >> hex_digits
or, indeed:
'$' >> &x3::digit >> hex_digits
none of the rules are actually recursive, so none of them require any separation of declaration and definition. This reduces the code by a huge margin
I suspect you want to interpret the hex data as numbers, not string. You could/should probably simplify the AST accordingly. Step 1: drop the distinction between things parsed from 1 or the other format:
namespace ast {
using hex_literal = std::string;
}
Now the whole program simplifies to Live On Coliru
#include <iostream>
#include <boost/spirit/home/x3.hpp>
namespace ast {
using hex_literal = std::string;
}
namespace parser {
namespace x3 = boost::spirit::x3;
auto const hex_digits = x3::rule<struct hex_digits_class, ast::hex_literal> {"hex_digits"}
= x3::skip(x3::char_('_')) [ +x3::xdigit ];
auto const hex_qualifier = x3::omit [ x3::char_("hxHX") ];
auto const hex_literal =
('$' >> &x3::xdigit | '0' >> hex_qualifier) >> hex_digits
| hex_digits >> hex_qualifier;
}
int main()
{
for (std::string const input : {
"$9", "0x1b", "0h1c", "1dh", "1ex",
"$9_f", "0x1_fb", "0h1_fc", "1_fdh", "1_fex"
}) {
ast::hex_literal parsed;
bool r = parse(input.begin(), input.end(), parser::hex_literal, parsed);
std::cout << "r = " << std::boolalpha << r << ", result = " << parsed << "\n";
}
}
Printing:
r = true, result = 9
r = true, result = 1b
r = true, result = 1c
r = true, result = 1d
r = true, result = 1e
r = true, result = 9f
r = true, result = 1fb
r = true, result = 1fc
r = true, result = 1fd
r = true, result = 1fe
Now, it seems obvious that really, you want to know the numeric value:
#include <iostream>
#include <boost/spirit/home/x3.hpp>
namespace ast {
using hex_literal = uintmax_t;
}
namespace parser {
namespace x3 = boost::spirit::x3;
auto const hex_qualifier = x3::omit [ x3::char_("hxHX") ];
auto const hex_literal
= ('$' >> &x3::xdigit | '0' >> hex_qualifier) >> x3::hex
| x3::hex >> hex_qualifier
;
}
int main()
{
for (std::string const input : {
"$9", "0x1b", "0h1c", "1dh", "1ex",
"$9_f", "0x1_fb", "0h1_fc", "1_fdh", "1_fex"
}) {
ast::hex_literal parsed;
auto f = input.begin(), l = input.end();
bool r = parse(f, l, parser::hex_literal, parsed) && f==l;
std::cout << std::boolalpha
<< "r = " << r
<< ",\tresult = " << parsed
<< ",\tremaining: '" << std::string(f,l) << "'\n";
}
}
Printing
r = true, result = 9, remaining: ''
r = true, result = 27, remaining: ''
r = true, result = 28, remaining: ''
r = true, result = 29, remaining: ''
r = true, result = 30, remaining: ''
r = false, result = 9, remaining: '_f'
r = false, result = 1, remaining: '_fb'
r = false, result = 1, remaining: '_fc'
r = false, result = 1, remaining: '1_fdh'
r = false, result = 1, remaining: '1_fex'
This is where I'd start considering a custom parser. This is because it will start involving a semantic action¹ as well as multiple attribute coercions, and frankly it's most convenient to package them up so you can just write imperative C++14 like anyone else:
#include <iostream>
#include <boost/spirit/home/x3.hpp>
namespace ast {
using hex_literal = uintmax_t;
}
namespace parser {
namespace x3 = boost::spirit::x3;
struct hex_literal_type : x3::parser_base {
using attribute_type = ast::hex_literal;
template <typename It, typename Ctx, typename RCtx>
static bool parse(It& f, It l, Ctx& ctx, RCtx&, attribute_type& attr) {
std::string digits;
skip_over(f, l, ctx); // pre-skip using surrounding skipper
auto constexpr max_digits = std::numeric_limits<attribute_type>::digits / 8;
auto digits_ = x3::skip(x3::as_parser('_')) [x3::repeat(1, max_digits) [ x3::xdigit ] ];
auto qualifier = x3::omit [ x3::char_("hxHX") ];
auto forms
= ('$' >> &x3::digit | '0' >> qualifier) >> digits_
| digits_ >> qualifier
;
if (x3::parse(f, l, forms, digits)) {
attr = std::stoull(digits, nullptr, 16);
return true;
}
return false;
}
};
hex_literal_type static const hex_literal;
}
int main() {
for (std::string const input : {
"$9", "0x1b", "0h1c", "1dh", "1ex",
"$9_f", "0x1_fb", "0h1_fc", "1_fdh", "1_fex",
// edge cases
"ffffffffH", // fits
"1ffffffffH", // too big
"$00_00___01___________0__________0", // fine
"0x", // fine, same as "0h"
"$",
// upper case
"$9", "0X1B", "0H1C", "1DH", "1EX",
"$9_F", "0X1_FB", "0H1_FC", "1_FDH", "1_FEX",
}) {
ast::hex_literal parsed = 0;
auto f = input.begin(), l = input.end();
bool r = parse(f, l, parser::hex_literal, parsed) && f==l;
std::cout << std::boolalpha
<< "r = " << r
<< ",\tresult = " << parsed
<< ",\tremaining: '" << std::string(f,l) << "'\n";
}
}
Note how I included
max_digits
to avoid runaway parsing (say when the input has 10 gigabyte of hex digits). You might want improve this by preskipping insignificant0
digits.
The output is now:
r = true, result = 9, remaining: ''
r = true, result = 27, remaining: ''
r = true, result = 28, remaining: ''
r = true, result = 29, remaining: ''
r = true, result = 30, remaining: ''
r = true, result = 159, remaining: ''
r = true, result = 507, remaining: ''
r = true, result = 508, remaining: ''
r = true, result = 509, remaining: ''
r = true, result = 510, remaining: ''
r = true, result = 4294967295, remaining: ''
r = false, result = 0, remaining: '1ffffffffH'
r = true, result = 256, remaining: ''
r = true, result = 0, remaining: ''
r = false, result = 0, remaining: '$'
r = true, result = 9, remaining: ''
r = true, result = 27, remaining: ''
r = true, result = 28, remaining: ''
r = true, result = 29, remaining: ''
r = true, result = 30, remaining: ''
r = true, result = 159, remaining: ''
r = true, result = 507, remaining: ''
r = true, result = 508, remaining: ''
r = true, result = 509, remaining: ''
r = true, result = 510, remaining: ''
In case you wanted to retain the input format for roundtripping you could trivially add that to the AST now:
#include <iostream>
#include <boost/spirit/home/x3.hpp>
namespace ast {
struct hex_literal {
uintmax_t value;
std::string source;
};
}
namespace parser {
namespace x3 = boost::spirit::x3;
struct hex_literal_type : x3::parser_base {
using attribute_type = ast::hex_literal;
template <typename It, typename Ctx, typename RCtx>
static bool parse(It& f, It l, Ctx& ctx, RCtx&, attribute_type& attr) {
std::string digits;
skip_over(f, l, ctx); // pre-skip using surrounding skipper
It b = f; // save start
auto constexpr max_digits = std::numeric_limits<decltype(attr.value)>::digits / 8;
auto digits_ = x3::skip(x3::as_parser('_')) [x3::repeat(1, max_digits) [ x3::xdigit ] ];
auto qualifier = x3::omit [ x3::char_("hxHX") ];
auto forms
= ('$' >> &x3::digit | '0' >> qualifier) >> digits_
| digits_ >> qualifier
;
if (x3::parse(f, l, forms, digits)) {
attr.value = std::stoull(digits, nullptr, 16);
attr.source.assign(b,l);
return true;
}
return false;
}
};
hex_literal_type static const hex_literal;
}
int main()
{
for (std::string const input : {
"$9", "0x1b", "0h1c", "1dh", "1ex",
"$9_f", "0x1_fb", "0h1_fc", "1_fdh", "1_fex",
// edge cases
"ffffffffH", // fits
"1ffffffffH", // too big
"$00_00___01___________0__________0", // fine
"0x", // fine, same as "0h"
"$",
// upper case
"$9", "0X1B", "0H1C", "1DH", "1EX",
"$9_F", "0X1_FB", "0H1_FC", "1_FDH", "1_FEX",
}) {
ast::hex_literal parsed = {};
auto f = input.begin(), l = input.end();
bool r = parse(f, l, parser::hex_literal, parsed) && f==l;
if (r) {
std::cout << "result = " << parsed.value
<< ",\tsource = '" << parsed.source << "'\n";
}
else {
std::cout << "FAILED"
<< ",\tremaining: '" << std::string(f,l) << "'\n";
}
}
}
Prints:
result = 9, source = '$9'
result = 27, source = '0x1b'
result = 28, source = '0h1c'
result = 29, source = '1dh'
result = 30, source = '1ex'
result = 159, source = '$9_f'
result = 507, source = '0x1_fb'
result = 508, source = '0h1_fc'
result = 509, source = '1_fdh'
result = 510, source = '1_fex'
result = 4294967295, source = 'ffffffffH'
FAILED, remaining: '1ffffffffH'
result = 256, source = '$00_00___01___________0__________0'
result = 0, source = '0x'
FAILED, remaining: '$'
result = 9, source = '$9'
result = 27, source = '0X1B'
result = 28, source = '0H1C'
result = 29, source = '1DH'
result = 30, source = '1EX'
result = 159, source = '$9_F'
result = 507, source = '0X1_FB'
result = 508, source = '0H1_FC'
result = 509, source = '1_FDH'
result = 510, source = '1_FEX'