Search code examples
c++ boost-spirit boost-spirit-x3

Boost Spirit X3: Parsing (some) whitespace into an enum


I have a parser in which I want to capture certain types of whitespace as enum values and preserve the spaces for the "text" values.

My whitespace parser is pretty basic (Note: I've only added the pipe character here for test/dev purposes):

// Symbol table mapping the literal delimiter characters onto Whitespace
// enumerators; `whitespace` is the parser instance the grammar refers to.
struct whitespace_p : x3::symbols<Whitespace>
{
    whitespace_p()
    {
        // One registration per statement rather than a single chained call.
        add("\n", Whitespace::NEWLINE);
        add("\t", Whitespace::TAB);
        add("|", Whitespace::PIPE); // pipe is only present for test/dev purposes
    }
} whitespace;

And I want to capture everything either into my enum or into std::strings:

// A single parsed element: holds either a Whitespace value or a std::string.
struct Element : x3::variant<Whitespace, std::string>
{
    // Re-export the base variant's constructors and assignment operators.
    using base_type::base_type;
    using base_type::operator=;
};

And to parse my input I use something like this:

// Parses one Element: either a run of characters that are not `whitespace`
// symbols (collected as text) or a single `whitespace` symbol.
const auto contentParser
    = x3::rule<class ContentParserID, Element, true> { "contentParser" }
    // no_skip[] keeps the text run free of skipping, so its spaces are preserved.
    = x3::no_skip[+(x3::char_ - (whitespace))]
        // NOTE(review): this alternative sits outside no_skip[], so a skipper
        // supplied by the caller presumably still pre-skips before it — confirm.
        | whitespace
    ;

using Elements = std::vector<Element>;

// Parses one or more Elements into an Elements vector.
// The rule carries its own tag type (ElementsParserID): a rule tag identifies
// a single rule, so it must not be shared with contentParser, whose attribute
// type is different.
const auto elementsParser
    = x3::rule<class ElementsParserID, Elements, true> { "elementsParser" }
    = +contentParser; // "one or more", equivalent to a >> *a

The problem though is that the parser stops at the first tab or newline it hits.

Code: http://coliru.stacked-crooked.com/a/d2cda4ce721279a4

#include <boost/spirit/home/x3.hpp>
#include <boost/spirit/home/x3/support/ast/variant.hpp>
#include <iostream>

namespace x3 = boost::spirit::x3;

// Delimiter kinds the parser reports as values instead of text.
// Enumerators keep their implicit values 0, 1 and 2.
enum Whitespace { NEWLINE, TAB, PIPE };

// Symbol table mapping the literal delimiter characters onto Whitespace
// enumerators; `whitespace` is the parser instance the grammar refers to.
struct whitespace_p : x3::symbols<Whitespace>
{
    whitespace_p()
    {
        // One registration per statement rather than a single chained call.
        add("\n", Whitespace::NEWLINE);
        add("\t", Whitespace::TAB);
        add("|", Whitespace::PIPE); // pipe is only present for test/dev purposes
    }
} whitespace;

// A single parsed element: holds either a Whitespace value or a std::string.
struct Element : x3::variant<Whitespace, std::string>
{
    // Re-export the base variant's constructors and assignment operators.
    using base_type::base_type;
    using base_type::operator=;
};

// Parses one Element: either a run of characters that are not `whitespace`
// symbols (collected as text) or a single `whitespace` symbol.
const auto contentParser
    = x3::rule<class ContentParserID, Element, true> { "contentParser" }
    // no_skip[] keeps the text run free of skipping, so its spaces are preserved.
    = x3::no_skip[+(x3::char_ - (whitespace))]
        // NOTE(review): this alternative sits outside no_skip[], so a skipper
        // supplied by the caller presumably still pre-skips before it — confirm.
        | whitespace
    ;

using Elements = std::vector<Element>;

// Parses one or more Elements into an Elements vector.
// The rule carries its own tag type (ElementsParserID): a rule tag identifies
// a single rule, so it must not be shared with contentParser, whose attribute
// type is different.
const auto elementsParser
    = x3::rule<class ElementsParserID, Elements, true> { "elementsParser" }
    = +contentParser; // "one or more", equivalent to a >> *a

// Visitor that renders either alternative of an Element as a std::string.
struct print_visitor
    : public boost::static_visitor<std::string>
{
    // Whitespace alternative: NEWLINE and PIPE get their own names; every
    // other value is reported as "tab".
    std::string operator()(const Whitespace& ws) const
    {
        switch (ws)
        {
            case Whitespace::NEWLINE: return "newline";
            case Whitespace::PIPE:    return "pipe";
            default:                  return "tab";
        }
    }

    // Text alternative: returned unchanged.
    std::string operator()(const std::string& str) const
    {
        return str;
    }
};

// Runs elementsParser over a fixed sample text and reports one of:
// a parse failure, the unparsed remainder, or every parsed element.
int main()
{
    const std::string text = "Hello \n World";
    auto cursor = text.cbegin();
    const auto finish = text.cend();

    Elements elements;

    // phrase_parse is handed x3::ascii::space as its skipper.
    const bool ok =
        phrase_parse(cursor, finish, elementsParser, x3::ascii::space, elements);

    if (!ok)
    {
        std::cout << "failed to parse!\n";
        return 0;
    }

    if (cursor != finish)
    {
        std::cout << "unparsed: " << std::string{cursor, finish} << '\n';
        return 0;
    }

    for (const auto& e : elements)
    {
        std::cout << "element: [" << boost::apply_visitor(print_visitor{}, e) << "]\n";
    }
    return 0;
}

If I parse the text Hello | World then I get the results I'm expecting. But if I instead use Hello \n World the whitespace after the \n is swallowed and the World is never parsed. Ideally I'd like to see this output:

element: [Hello ]
element: [newline]
element: [ World]

How can I accomplish this? Thank you!


Solution

  • My go-to reference on skipper issues: Boost spirit skipper issues

    In this case you made it work with no_skip[]. That's correct.

    no_skip is like lexeme except it doesn't pre-skip, from the source (boost/spirit/home/x3/directive/no_skip.hpp):

    // same as lexeme[], but does not pre-skip
    

    Alternative Take

    In your case I would flip the logic: just adjust the skipper itself.

    Also, don't supply the skipper with phrase_parse, because your grammar is highly sensitive to the correct value of the skipper.

    Your whole grammar could be:

    // The skipper is set inside the grammar: x3::space minus whatever
    // `whitespace` matches. The body is any number of x3::graph runs or
    // `whitespace` symbols.
    // NOTE(review): '|' presumably also counts as an x3::graph character, so it
    // may be absorbed into a text run when it follows text — confirm.
    const auto p  = x3::skip(x3::space - whitespace) [
            *(+x3::graph | whitespace)
        ];
    

    Here's a Live Demo On Coliru

    #include <boost/spirit/home/x3.hpp>
    #include <boost/spirit/home/x3/support/ast/variant.hpp>
    #include <iostream>
    #include <iomanip>
    
    namespace x3 = boost::spirit::x3;
    
    // Delimiter kinds the parser reports as values instead of text.
    // Enumerators keep their implicit values 0, 1 and 2.
    enum Whitespace
    {
        NEWLINE,
        TAB,
        PIPE
    };
    
    // Symbol table mapping the literal delimiter characters onto Whitespace
    // enumerators; `whitespace` is the shared, immutable parser instance.
    struct whitespace_p : x3::symbols<Whitespace> {
        whitespace_p() {
            // One registration per statement rather than a single chained call.
            add("\n", Whitespace::NEWLINE);
            add("\t", Whitespace::TAB);
            add("|", Whitespace::PIPE);
        }
    } static const whitespace;
    
    // A single parsed element: holds either a Whitespace value or a std::string.
    struct Element : x3::variant<Whitespace, std::string> {
        // Re-export the base variant's constructors and assignment operators.
        using base_type::base_type;
        using base_type::operator=;
    };
    
    using Elements = std::vector<Element>;
    
    // Streams an Element: a Whitespace value as a bracketed tag, text via std::quoted.
    static inline std::ostream& operator<<(std::ostream& os, Element const& el) {
        struct print_visitor {
            std::ostream& os;

            // Whitespace alternative: pick the tag first, then write it once.
            std::ostream& operator()(Whitespace ws) const {
                char const* tag = "?"; // used when ws matches none of the cases
                switch (ws) {
                    case Whitespace::NEWLINE: tag = "[newline]"; break;
                    case Whitespace::PIPE:    tag = "[pipe]";    break;
                    case Whitespace::TAB:     tag = "[tab]";     break;
                }
                return os << tag;
            }

            // Text alternative.
            std::ostream& operator()(const std::string& str) const {
                return os << std::quoted(str);
            }
        };

        print_visitor vis{os};
        return boost::apply_visitor(vis, el);
    }
    
    int main() {
        std::string const text = "\tHello \n World";
        auto start = begin(text), stop = end(text);
    
        const auto p  = x3::skip(x3::space - whitespace) [
                *(+x3::graph | whitespace)
            ];
    
        Elements elements;
    
        if (!parse(start, stop, p, elements)) {
            std::cout << "failed to parse!\n";
        } else {
            std::copy(begin(elements), end(elements), std::ostream_iterator<Element>(std::cout, "\n"));
        }
    
        if (start != stop) {
            std::cout << "unparsed: " << std::quoted(std::string(start, stop)) << '\n';
        }
    }
    

    Prints

    [tab]
    "Hello"
    [newline]
    "World"
    

    Even Simpler?

    It doesn't seem like you'd need any skipper here at all. Why not:

    // Skipper-free form: any number of either a run of characters other than
    // newline, tab or pipe, or a single `whitespace` symbol.
    const auto p  = *(+~x3::char_("\n\t|") | whitespace);
    

    While we're at it, there's no need for symbols to map enums:

    // Element now stores the delimiter as a plain char instead of an enum value.
    struct Element : x3::variant<char, std::string> {
        // ...
    };
    using Elements = std::vector<Element>;
    

    And then

    // One Element: a run of characters other than newline, tab or pipe (text),
    // or, failing that, any single character.
    const auto p
        = x3::rule<struct ID, Element> {}
        = +~x3::char_("\n\t|") | x3::char_;
    

    Live On Coliru

    #include <boost/spirit/home/x3.hpp>
    #include <boost/spirit/home/x3/support/ast/variant.hpp>
    #include <iostream>
    #include <iomanip>
    
    namespace x3 = boost::spirit::x3;
    
    // A single parsed element: either one delimiter character (char) or a run
    // of text (std::string).
    struct Element : x3::variant<char, std::string> {
        using variant = x3::variant<char, std::string>;
        using variant::variant;
        using variant::operator=;

        // Streams an Element: a delimiter char as a bracketed tag, text via
        // std::quoted.
        friend std::ostream& operator<<(std::ostream& os, Element const& el) {
            struct print_visitor {
                std::ostream& os;

                // Delimiter alternative. Each character maps to the tag that
                // names it: '\t' is the tab and '|' is the pipe.
                auto& operator()(char ws) const {
                    switch(ws) {
                        case '\n': return os << "[newline]";
                        case '\t': return os << "[tab]";
                        case '|': return os << "[pipe]";
                    }
                    return os << "?"; // any other single character
                }

                // Text alternative.
                auto& operator()(const std::string& str) const { return os << std::quoted(str); }
            } vis{os};
            return boost::apply_visitor(vis, el);
        }
    };
    using Elements = std::vector<Element>;
    
    int main() {
        std::string const text = "\tHello \n World";
        auto start = begin(text);
        auto const stop = end(text);
    
        Elements elements;
        const auto p
            = x3::rule<struct ID, Element> {}
            = +~x3::char_("\n\t|") | x3::char_;
    
        if (!parse(start, stop, *p, elements)) {
            std::cout << "failed to parse!\n";
        } else {
            std::copy(begin(elements), end(elements), std::ostream_iterator<Element>(std::cout, "\n"));
        }
    
        if (start != stop) {
            std::cout << "unparsed: " << std::quoted(std::string(start, stop)) << '\n';
        }
    }
    

    Prints

    [tab]
    "Hello "
    [newline]
    " World"