Search code examples
javascriptpegjs

Pegjs reserved keyword


I have a grammar that supports the following tokens:
AND, OR, NOT, ( and ), ", '

A few samples of what I need to be able to parse:

  • a1 OR a2
  • a1 a2 (same as above)
  • a1 AND a2
  • "a1" AND 'a2'
  • a1 OR a2 NOT a3
  • a1 a2 OR "a3" NOT(a1 AND a2 'a6')

Considering a1, a2, and so on are real user input that could include almost anything like:

  • 123
  • Tesla-S
  • origami

The problem occurs when one of those words is unquoted and starts with a reserved keyword, for example:

  • origami
  • andromede

In this case, the parser sees:

  • or keyword + igami text
  • and keyword + romede text

That's the problem I got.

I've spent two days trying various solutions found on Stack Overflow (and in the official documentation):

(and many others), trying to find a solution that satisfies these constraints:

  • it's not guaranteed that there is a space before/after a keyword; for example "a1 AND(a2 OR a3)" is valid (no space between "AND" and "("), and so is "(a1 AND a2)OR a3" => a keyword can be preceded/followed by a space and/or "("/")" (but when I try this I break the parenthesis rule)
  • a word is NOT a word only if it's part of the reserved section: "AND"i / "OR"i / "NOT"i / "(" / ")" / "'" / '"' / " " => any of those are not a word, anything else is, like ando is a word, not a keyword.

Here is the code I came up with:

// First rule of the grammar (PEG.js starts parsing from the first rule):
// a whole query is simply an AND-level expression.
content = andOperator

// Outermost operator level: one or more OR-level operands joined by an
// explicit AND keyword (any letter case), folded left to right into nested
// { type: "and", value: { left, right } } nodes. A single operand is returned as-is.
andOperator
    = first:orOperator rest:(_ "AND"i _ orOperator)* {
        // Every entry of `rest` is [ws, "AND", ws, operand]; index 3 is the operand.
        var node = first;
        for (var i = 0; i < rest.length; i++) {
            node = {
                type: "and",
                value: {
                    left: node,
                    right: rest[i][3]
                }
            };
        }
        return node;
    }

// OR level: operands joined either by an explicit OR keyword (any letter case)
// or by nothing but optional whitespace (implicit OR), folded left to right
// into nested { type: "or", value: { left, right } } nodes.
orOperator
    = first:notOperator rest:(_ ("OR"i / _) _ notOperator)* {
        // Every entry of `rest` is [ws, connector, ws, operand]; index 3 is the operand.
        var node = first;
        for (var i = 0; i < rest.length; i++) {
            node = {
                type: "or",
                value: {
                    left: node,
                    right: rest[i][3]
                }
            };
        }
        return node;
    }

// NOT level: an operand followed by any number of "[AND | OR] NOT operand" groups.
// Each group wraps its operand in a { type: "not" } node and joins it to what
// came before with "or" when the connector is OR, and with "and" otherwise
// (explicit AND, or no connector at all).
//
// All three keywords are matched case-insensitively. The OR alternative used to
// be case-sensitive, which made "a or not b" fail while "a OR NOT b" and
// "a and not b" parsed.
notOperator
    = head:parenthesis tail:(_ ("AND"i / "OR"i / _) _ "NOT"i _ parenthesis)* {
        return tail.reduce(function(result, element) {
            // element = [ws, connector, ws, "NOT", ws, operand]. The connector is the
            // keyword text as typed, or a non-keyword value when none was written,
            // hence the truthiness guard before lowercasing.
            var type = (element[1] && element[1].toLowerCase() === "or") ? "or" : "and";
            return {
                type: type,
                value: {
                    left: result,
                    right: {
                        type: "not",
                        value: element[5]
                    }
                }
            };
        }, head);
    }

// A parenthesised group of one or more `content` expressions, or plain text.
// For a group, the result is { type: "parenthesis", value }, where `value` is
// the single inner expression when there is exactly one, and the list otherwise.
parenthesis "Parenthesis"
    = _ "(" _ inner:content+ _ ")" _ {
        var unwrapped = inner;
        if (Array.isArray(inner) && inner.length === 1) {
            unwrapped = inner[0];
        }
        return {
            type: "parenthesis",
            value: unwrapped
        };
    }
    / text

/*
-----------------------------
  TEXT
-----------------------------
*/

// One or more adjacent text pieces (single-quoted, double-quoted or unquoted),
// with optional whitespace around the whole run. Returns the lone piece when
// there is exactly one, otherwise the array of pieces.
text "Text"
    = _ pieces:(singleQuoteText / doubleQuoteText / noQuoteText)+ _ {
        if (Array.isArray(pieces) && pieces.length === 1) {
            return pieces[0];
        }
        return pieces;
    }

// Text wrapped in single quotes: everything up to the next "'" is taken
// literally (keywords included) and returned trimmed as a "text" node.
singleQuoteText "Single Quote Text"
    = "'" raw:$([^\']+) "'" {
        var value = raw;
        if (value) {
            value = value.trim();
        }
        return {
            type: "text",
            value: value
        };
    }

// Text wrapped in double quotes: everything up to the next '"' is taken
// literally (keywords included) and returned trimmed as a "text" node.
doubleQuoteText "Double Quote Text"
    = '"' raw:$([^\"]+) '"' {
        var value = raw;
        if (value) {
            value = value.trim();
        }
        return {
            type: "text",
            value: value
        };
    }

// Unquoted text: the longest run of characters in which no position starts a
// `reserved` match, returned trimmed as a "text" node.
// NOTE: the `!reserved` check runs before every character, so a keyword
// spelled at the start of, or inside, a longer word ("origami", "andromede")
// cuts the word at that point.
noQuoteText "No Quote Text"
    = raw:$(!reserved .)+ {
        var value = raw;
        if (value) {
            value = value.trim();
        }
        return {
            type: "text",
            value: value
        };
    }

// Everything that terminates an unquoted word: the operator keywords (any
// letter case), both parentheses, both quote characters and the plain space.
// NOTE: the keyword alternatives carry no word-boundary check, so they also
// match the first letters of longer words such as "origami" or "andromede".
reserved "List of keyword this grammar allow"
    = ("AND"i / "OR"i / "NOT"i / "(" / ")" / "'" / '"' / " ")

/*
-----------------------------
  WHITESPACE PARSING
-----------------------------
*/
// One or more whitespace characters; `$` returns the matched text as a string.
__ "Mandatory Whitespace"
    = $(whitespace+)

// Zero or more whitespace characters: yields the matched text, or null when
// there is none (PEG.js returns null for an unmatched `?`). Never fails.
_ "Optional Whitespace"
    = __?

// A single whitespace unit: one character from the listed set of Unicode
// blanks (tab, vertical tab, form feed, space, no-break space, BOM and the
// other listed space code points), or a line break written as "\r\n" or "\n".
// A lone "\r" is not treated as whitespace.
whitespace
    = [\u0009\u000B\u000C\u0020\u00A0\uFEFF\u1680\u180E\u2000-\u200A\u202F\u205F\u3000] / $('\r\n' / '\n')

Example of the problem: hello origami

Gives with the current grammar:

{
   "type": "or",
   "value": {
      "left": {
         "type": "text",
         "value": "hello"
      },
      "right": {
         "type": "text",
         "value": "igami"
      }
   }
}

It should give (origami must be treated as a whole word, not as or + igami):

{
   "type": "or",
   "value": {
      "left": {
         "type": "text",
         "value": "hello"
      },
      "right": {
         "type": "text",
         "value": "origami"
      }
   }
}

In the current parser, origami is split into OR + igami, whereas the whole word origami should have been kept as a single text token.


Solution

  • Using a predicate, you can include a rule that matches all words except your keywords, like so:

    {
       // Reserved words, stored in lowercase. Word lowercases its candidate
       // before the lookup, so "AND", "And" and "and" are all treated as keywords.
       var keywords = ["and", "or"];

    }

    // Tries Word first. Only when the whole run of letters is a keyword does
    // the Word predicate fail and the Keyword alternative take over.
    Expression =
        word:$(Word) { return { word: word } } /
        keyword:$(Keyword) { return { keyword: keyword } }

    // Word will match everything except "and" and "or" (in any letter case),
    // including words like "origami" and "andromede".
    // The predicate runs after the complete run of letters has been consumed,
    // so it compares the entire word, never just a prefix.
    Word = word:$([a-zA-Z]+) &{ return !keywords.includes(word.toLowerCase()) }

    // Any run of letters; only reached when Word rejected it as a keyword.
    Keyword = [a-zA-Z]+
    

    In the above grammar, Word will match all words except "or" and "and". If the word (the entire word) is one of those keywords, then the Keyword rule will match instead.

    So, given the input and, you will get the following output:

    {
        keyword: "and"
    }
    

    But given the input andromede, you will get the following output:

    {
        word: "andromede"
    }