Search code examples
javascriptjison

Jison Google-Like Parser


I have written a parser using Jison that is able to handle google-like search queries with operators and boolean operation support. Currently, I am having difficulty figuring out how to accept whitespace in between the AND OR and NOT operators. Any help would be greatly appreciated and I have attached some examples of desired input/output below.

Input:

  1. true && false || true
  2. ( true ) && ( false || true )
  3. true&&false||true

Result:

1-3. ([true]&&([false]||[true]))

Code:

%lex
%%

/* Lexical Grammar */

"AND"|"&&"          { return "AND" }
"OR"|"||"           { return "OR" }
"NOT"|"!"           { return "NOT" }
"("                 { return "OPEN" }
")"                 { return "CLOSE" }
":"                 { return "QUAL" }
"-"                 { return "DASH" }
"\""|"'"            { return "QUOTE" }
\s+                 { return "SPACE" }
\w+                 { return "WORD" }
"."                 { return "DOT" }
<<EOF>>             { return "EOF" }
.                   { return "INVALID" }
/lex

/* Operators */

%right AND OR
%right NOT
%right QUAL DASH DOT

%start START

%%

/* Language Grammar */

START
    : EXP EOF
        { return $1; }
    ;

EXP
    : EXP AND EXP
        { $$ = "(" + $1 + "&&" + $3 + ")"; }
    | EXP OR EXP
        { $$ = "(" + $1 + "||" + $3 + ")"; }
    | NOT EXP
        { $$ = "(!" + $2 + ")"; }
    | OPEN EXP CLOSE
        { $$ = $2; }
    | ARGS
        { $$ = "[" + $1 + "]"; }
    ;

ARGS
    : ARG SPACE ARGS
        { $$ = [ $1 ].concat($3); }
    | OP SPACE ARGS
        { $$ = [ $1 ].concat($3); }
    | ARG
        { $$ = [ $1 ]; }
    | OP
        { $$ = [ $1 ]; }
    ;

OP
    : DASH OP
        { $$ = "-" + $2; }
    | ARG QUAL ARG
        { $$ = $1 + ":" + $3; }
    ;

ARG
    : DASH ARG
        { $$ = "-" + $2; }
    | QUOTE TERMS QUOTE
        { $$ = $2.join(" "); }
    | TERM
        { $$ = $1; }
    ;

TERMS
    : TERM SPACE TERMS
        { $$ = [ $1 ].concat($3); }
    | TERM
        { $$ = [ $1 ]; }
    ;

TERM
    : TERM DASH TERM
        { $$ = $1 + $2 + $3; }
    | TERM DOT TERM
        { $$ = $1 + $2 + $3; }
    | WORD
        { $$ = $1; }
    ;

Solution

  • Figured it out. I started ignoring the whitespace, changed a few rules, and resolved the conflicts. The parser returns a function that is used to determine if some object matches the query. Here is the final result:

    /* Google-Like Parser */
    
    /* Lexical Grammar */
    
    %lex
    %%
    
    \s+                   { /* ignore whitespace */ }
    "AND"|"&&"            { return "AND" }
    "OR"|"||"             { return "OR" }
    "NOT"|"!"             { return "NOT" }
    "("                   { return "OPEN" }
    ")"                   { return "CLOSE" }
    ":"                   { return "QUAL" }
    "-"                   { return "NEG" }
    "\""|"'"              { return "QUOTE" }
    \w+                   { return "WORD" }
    "."                   { return "DOT" }
    <<EOF>>               { return "EOF" }
    .                     { return "INVALID" }
    
    /lex
    
    /* Operators */
    
    %right AND OR
    %right NOT
    %right QUAL NEG DOT
    
    %start START
    
    %%
    
    /* Language Grammar */
    
    START
        : EXP EOF
            { return $1; }
        ;
    
    EXP
        : EXP AND EXP
            { $$ = function(obj) { return ($1(obj) && $3(obj)); }; }
        | EXP OR EXP
            { $$ = function(obj) { return ($1(obj) || $3(obj)); }; }
        | NOT EXP
            { $$ = function(obj) { return !($2(obj)); }; }
        | OPEN EXP CLOSE
            { $$ = $2; }
        | ARGS
            { $$ = function(obj) { return parser.processArgs(obj, $1)(obj); }; }
        ;
    
    ARGS
        : ARG ARGS
            { $$ = [ $1, $2]; }
        | OP ARGS
            { $$ = [ $1, $2]; }
        | ARG
            { $$ = [ $1 ]; }
        | OP
            { $$ = [ $1 ]; }
        ;
    
    OP
        : NEG ARG
            {{
                $2.not = true;
                $$ = $2;
            }}
        | NEG ARG QUAL ARG
            {{
                $$ = {
                    "not": true,
                    "operator": $2.operand,
                    "operand": $4.operand
                };
            }}
        | ARG QUAL ARG
            {{
                $$ = {
                    "not": false,
                    "operator": $1.operand,
                    "operand": $3.operand
                };
            }}
        ;
    
    ARG
        : QUOTE TERMS QUOTE
            {{
                $$ = {
                    "not": false,
                    "operator": null,
                    "operand": $2.join(" ")
                };
            }}
        | TERM
            {{
                $$ = {
                    "not": false,
                    "operator": null,
                    "operand": $1
                };
            }}
        ;
    
    TERMS
        : TERM TERMS
            { $$ = [ $1 ].concat($2); }
        | TERM
            { $$ = [ $1 ]; }
        ;
    
    TERM
        : WORD DOT TERM
            { $$ = $1 + $2 + $3; }
        | WORD
            { $$ = $1; }
        ;
    
    %%
    
    parser.processArgs = function(obj, args) {
        if (args.length > 1)
        {
            if (args[0].operator)
                return function(obj) { return (parser.matchArg(obj, args[0]) && parser.processArgs(args[1])(obj)); };
            else
                return function(obj) { return (parser.matchArg(obj, args[0]) || parser.processArgs(args[1])(obj)); };
        }
        else
        {
            return function(obj) { return parser.matchArg(obj, args[0]); };
        }
    }
    
    /* Override Later */
    parser.matchArg = function(obj, arg) {
        return true;
    }