Search code examples
c++boost-spiritboost-spirit-qiwstring

boost spirit parsing quote string fails


This is my grammar:

// Escape table: maps each escape sequence as it appears in the input
// (a backslash followed by one character) to the single character it stands for.
unesc_char.add(L"\\a", L'\a')(L"\\b", L'\b')(L"\\f", L'\f')(L"\\n", L'\n')
              (L"\\r", L'\r')(L"\\t", L'\t')(L"\\v", L'\v')(L"\\\\", L'\\')
              (L"\\\'", L'\'')(L"\\\"", L'\"');
// Quoted string: an opening '"', any number of characters, a closing '"'.
// NOTE: the plain-character alternative (char_ - '"') is listed first and also
// matches a backslash, so unesc_char is only tried when the next character is
// '"' -- and no entry in the table starts with '"'.
unesc_str = '\"' >> *((boost::spirit::standard_wide::char_ - '\"') | unesc_char) >> '\"';

with

// Rule for a quoted string; its attribute is the resulting std::wstring.
qi::rule<Iterator, std::wstring()> unesc_str;
// Symbol table mapping escape sequences in the input to the wchar_t they produce.
qi::symbols<wchar_t const, wchar_t const> unesc_char;

Parsing fails on: "Hello\"" (expected result: Hello")
Parsing succeeds on: "Hello\\" (expected result: Hello\)

Changing the rule to

// Quoted string with unesc_char as the first alternative: escape sequences are
// tried before falling back to "any character except '"'".
unesc_str = '\"' >> *(unesc_char | (boost::spirit::standard_wide::char_ - '\"')) >> '\"';

Parsing succeeds on: "Hello\"" (expected result: Hello")
Parsing fails on: "Hello\\" (expected result: Hello\)

How can I get both cases to parse correctly?


Solution

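  • PEG grammars try alternatives left-to-right and commit to the first one that matches, so you need to have unesc_char in front to handle escapes; otherwise the backslash is consumed as an ordinary character before the escape is ever considered.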

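    Furthermore, I think you're probably confusing yourself with the levels of input escaping: the escapes inside a C++ string literal are consumed by the compiler, so the parser only sees what is left afterwards (the raw string literals below show the actual input):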

    Live On Coliru

    #include <iostream>
    #include <string>

    #include <boost/spirit/include/qi.hpp>
    
    namespace qi = boost::spirit::qi;
    
    /// Grammar for a double-quoted wide string with C-style backslash escapes.
    ///
    /// The attribute is the string content with the surrounding quotes removed
    /// and every recognised escape sequence replaced by the character it denotes.
    template <typename It>
    struct Parser : qi::grammar<It, std::wstring()> {
        Parser() : Parser::base_type(unesc_str) {
            namespace enc = qi::standard_wide;

            // One entry per escape: the text as it appears in the input and
            // the character it decodes to.
            struct Escape {
                wchar_t const* sequence;
                wchar_t        decoded;
            };
            Escape const table[] = {
                { L"\\a",  L'\a' },
                { L"\\b",  L'\b' },
                { L"\\f",  L'\f' },
                { L"\\n",  L'\n' },
                { L"\\r",  L'\r' },
                { L"\\t",  L'\t' },
                { L"\\v",  L'\v' },
                { L"\\\\", L'\\' },
                { L"\\'",  L'\'' },
                { L"\\\"", L'\"' },
            };
            for (Escape const& entry : table) {
                unesc_char.add(entry.sequence, entry.decoded);
            }

            // Escapes must be tried first: the fallback alternative accepts a
            // backslash as an ordinary character.
            unesc_str = L'"'
                     >> *(unesc_char | ~enc::char_(L'"'))
                     >> L'"';
        }

      private:
        qi::rule<It, std::wstring()> unesc_str;                // whole quoted string
        qi::symbols<wchar_t const, wchar_t const> unesc_char;  // escape sequence -> character
    };
    
    /// Runs the grammar over a few sample inputs and prints each result.
    ///
    /// Each sample after the first is given twice, once as a raw string literal
    /// and once as the equivalent escaped literal, to show which characters the
    /// parser actually receives. A failed parse is reported explicitly, and any
    /// input left unconsumed is printed as well.
    int main() {
        using It = std::wstring::const_iterator;
        Parser<It> const p {};
        for (std::wstring const input : {
                L"\"abaca\\tdabra\"",
                LR"("Hello\"")", L"\"Hello\\\"\"", // equivalent
                LR"("Hello\\")", L"\"Hello\\\\\"",
        }) {
            It f = input.begin(), l = input.end();
            std::wstring s;
            if (parse(f, l, p, s)) {
                std::wcout << L"Unescape: " << input << L" -> " << s << L"\n";
            } else {
                // Without this branch a failed parse of an empty input would
                // print nothing at all.
                std::wcout << L"Parse failed: " << input << L"\n";
            }

            // parse() can succeed without consuming everything; show what is left.
            if (f != l)
                std::wcout << L"Remaining input: '" << std::wstring(f, l) << L"'\n";
        }
    }
    

    Prints

    Unescape: "abaca\tdabra" -> abaca   dabra
    Unescape: "Hello\"" -> Hello"
    Unescape: "Hello\"" -> Hello"
    Unescape: "Hello\\" -> Hello\
    Unescape: "Hello\\" -> Hello\
    

    BONUS

    I'd probably simplify this by dropping the symbols table. This is more flexible and probably more efficient, unless you need a dynamic list of escapes:

    Live On Coliru

    namespace enc = qi::standard_wide;
    
    // Quoted string: opening '"', any number of (possibly escaped) characters,
    // closing '"'.
    unesc_str = '"' >> *(
            // Escape: a backslash followed by one character. The listed letters
            // yield the corresponding control character through qi::attr.
            '\\' >> (
                'a' >> qi::attr('\a')
              | 'b' >> qi::attr('\b')
              | 'f' >> qi::attr('\f')
              | 'n' >> qi::attr('\n')
              | 'r' >> qi::attr('\r')
              | 't' >> qi::attr('\t')
              | 'v' >> qi::attr('\v')
              // Any other escaped character stands for itself (e.g. \" and \\).
              | enc::char_
          // Unescaped: any character except the closing quote.
          ) | ~enc::char_('"')) >> '"';