Search code examples
c++regexboost

boost regex to match IF statements


I need to write a boost regex to match the following string and separate it into three tokens, one for each parameter of the IF block

=IF(ISNUMBER(SEARCH("Windows",GETWORKSPACE(1))),ON.TIME(NOW()+"00:00:02","abcdef"),CLOSE(TRUE))

Ideally these should come to

token1 = "ISNUMBER(SEARCH("Windows",GETWORKSPACE(1)))"
token2 = "ON.TIME(NOW()+"00:00:02","abcdef")"
token3 = "CLOSE(TRUE)"

I had originally written a simple regex as "(?<=\=IF\()(.*),(.*),(.*)(?=\))" which gives incorrect tokens because the greedy quantifier takes too much into the first token. I am currently getting

token1 =     "ISNUMBER(SEARCH("Windows",GETWORKSPACE(1))),ON.TIME(NOW()+"00:00:02""
token2 =     ""abcdef")"
token3 =     "CLOSE(TRUE)"

I also tried "(?<=\\=IF\\()([A-Za-z(),:\"]*?),([A-Za-z(),.:\"]*?),([A-Z(),:\"]*?)(?=\\))" with no luck. Can someone please suggest a regex?


Solution

  • You need a simple parser.

    Here's one written with Boost Spirit X3, my favorite Boost swiss-army knife for quick parsers.

    I've created a very flexible "token" grammar that honours (nested) parentheses and double-quoted string literals (potentially with embedded escaped quotes and parentheses):

    token = raw [ *(
          '(' >> -token_list >> ')'
        | '[' >> -token_list >> ']'
        | '{' >> -token_list >> '}'
        | string_literal
        | lexeme[ + ~char_(")]}([{\"',") ]
        ) ];
    

    Where token_list and string_literal are defined as

    string_literal = lexeme [
        '"' >> *('\\' >> char_ | ~char_('"')) >> '"'
    ];
    
    token_list = token % ',';
    

    Now the parser expression for an =IF(condition, true_part, false_part) is simply:

    if_expr
        = '=' >> no_case["if"] 
        >> '(' >> token >> ',' >> token >> ',' >> token >> ')';
    

    For fun I made the IF keyword case-insensitive

    DEMO

    Live On Coliru

    //#define BOOST_SPIRIT_X3_DEBUG
    #include <boost/spirit/home/x3.hpp>
    #include <boost/fusion/adapted/std_tuple.hpp>
    #include <iostream>
    #include <iomanip>
    namespace x3 = boost::spirit::x3;
    
    // Grammar for spreadsheet-style `=IF(condition, true_part, false_part)`
    // expressions, assembled from Boost.Spirit X3 parser combinators.
    namespace parser {
        using namespace x3;
    
        // Declared up front, with std::string as its attribute type, because the
        // definition below refers back to `token` through `token_list`. The body
        // (`token_def`) is attached by BOOST_SPIRIT_DEFINE further down.
        static rule<struct token_, std::string> const token = "token";
    
        // Double-quoted string literal. Between the quotes, a backslash consumes
        // the character that follows it (so \" does not end the literal); any
        // other character except '"' is accepted. lexeme[] disables the skipper,
        // so blanks inside the literal are matched instead of being skipped.
        static auto const string_literal = lexeme [
            '"' >> *('\\' >> char_ | ~char_('"')) >> '"'
        ];
    
        // One or more tokens separated by commas.
        static auto const token_list = token % ',';
    
        // A token is zero or more of the pieces below, so it may also be empty.
        // raw[] makes the attribute the matched input range itself, which is why
        // callers receive the token text exactly as it appears in the input.
        // Pieces:
        //   - a (), [] or {} group containing an optional comma-separated token list
        //   - a double-quoted string literal
        //   - a run of characters that are none of  ) ] } ( [ { " ' ,
        // NOTE(review): the single quote is in the excluded set but no alternative
        // consumes it, so a token ends at the first unquoted ' - confirm that this
        // is intended.
        static auto const token_def = raw [ *(
              '(' >> -token_list >> ')'
            | '[' >> -token_list >> ']'
            | '{' >> -token_list >> '}'
            | string_literal
            | +~char_(")]}([{\"',")  // glue together everything else
            ) ];
    
        // Binds the rule `token` to its definition `token_def`.
        BOOST_SPIRIT_DEFINE(token)
    
        // '=' followed by a case-insensitive "if" and a parenthesised list of
        // exactly three comma-separated tokens: condition, true part, false part.
        // The caller in main() receives them as a tuple of three strings.
        static auto const if_expr
            = '=' >> no_case["if"] 
            >> '(' >> token >> ',' >> token >> ',' >> token >> ')';
    }
    
    // Demo driver: parses each sample expression with parser::if_expr (skipping
    // blanks) and prints the three extracted parts, or a failure message,
    // followed by any input that was left unconsumed.
    int main() {
        // Samples: bare identifiers, nested calls with string literals, input
        // with embedded blanks, and a nested IF with brackets/escaped quotes
        // inside a string literal.
        char const* const samples[] = {
            R"(=IF(ISNUMBER,ON.TIME,CLOSE))",
            R"(=IF(ISNUMBER(SEARCH("Windows")),ON.TIME(NOW()+"00:00:02","abcdef"),CLOSE(TRUE)))",
            R"(=IF(ISNUMBER(SEARCH("Windows",GETWORKSPACE(1))),ON.TIME(NOW()+"00:00:02","abcdef"),CLOSE(TRUE)))",
            " = if( isnumber, on .time, close ) ",
            R"( = if( "foo, bar", if( isnumber, on .time, close ), IF("[ISN(UM}B\"ER")) )",
        };

        for (std::string const input : samples) {
            std::cout << "=== " << std::quoted(input) << ":\n";

            // `first` is advanced by the parser; whatever lies between it and
            // `last` afterwards was not consumed.
            std::string::const_iterator first = input.begin();
            std::string::const_iterator const last = input.end();

            std::string condition;
            std::string true_part;
            std::string false_part;
            // Tuple of references: the three token attributes are written
            // straight into the strings above.
            auto attr = std::tie(condition, true_part, false_part);

            bool const ok = phrase_parse(first, last, parser::if_expr, x3::blank, attr);
            if (!ok) {
                std::cout << "Parse failed\n";
            } else {
                std::cout << "Parsed: \n";
                std::cout << " - condition: " << std::quoted(condition) << "\n";
                std::cout << " - true_part: " << std::quoted(true_part) << "\n";
                std::cout << " - false_part: " << std::quoted(false_part) << "\n";
            }

            if (first != last) {
                std::string const rest(first, last);
                std::cout << "Remaining unparsed: " << std::quoted(rest) << "\n";
            }
        }
    }
    

    Prints

    === "=IF(ISNUMBER,ON.TIME,CLOSE)":
    Parsed: 
     - condition: "ISNUMBER"
     - true_part: "ON.TIME"
     - false_part: "CLOSE"
    === "=IF(ISNUMBER(SEARCH(\"Windows\")),ON.TIME(NOW()+\"00:00:02\",\"abcdef\"),CLOSE(TRUE))":
    Parsed: 
     - condition: "ISNUMBER(SEARCH(\"Windows\"))"
     - true_part: "ON.TIME(NOW()+\"00:00:02\",\"abcdef\")"
     - false_part: "CLOSE(TRUE)"
    === "=IF(ISNUMBER(SEARCH(\"Windows\",GETWORKSPACE(1))),ON.TIME(NOW()+\"00:00:02\",\"abcdef\"),CLOSE(TRUE))":
    Parsed: 
     - condition: "ISNUMBER(SEARCH(\"Windows\",GETWORKSPACE(1)))"
     - true_part: "ON.TIME(NOW()+\"00:00:02\",\"abcdef\")"
     - false_part: "CLOSE(TRUE)"
    === " = if( isnumber, on .time, close ) ":
    Parsed: 
     - condition: "isnumber"
     - true_part: "on .time"
     - false_part: "close "
    === " = if( \"foo, bar\", if( isnumber, on .time, close ), IF(\"[ISN(UM}B\\\"ER\")) ":
    Parsed: 
     - condition: "\"foo, bar\""
     - true_part: "if( isnumber, on .time, close )"
     - false_part: "IF(\"[ISN(UM}B\\\"ER\")"