Search code examples
antlr, antlr4

ANTLR 4 parser grammar has trouble with if expression


I've written a simple grammar for a language meant to be used in graph-based dialogue systems (primarily for video games).

Here are the grammars:

parser grammar DialogueScriptParser;

options {
    tokenVocab = DialogueScriptLexer;
}

// Entry point: any number of scheduled blocks followed by end of input.
script
    : scheduled_block* EOF
    ;

// Scheduled block: a body of statements between an opening and a closing delimiter.
scheduled_block
    : scheduled_block_open block scheduled_block_close
    ;

// Opening delimiter: two LT tokens with an optional flag list between them.
scheduled_block_open
    : LT flag_list? LT
    ;

// Closing delimiter: two GT tokens with an optional flag list between them.
scheduled_block_close
    : GT flag_list? GT
    ;

// Body of a scheduled block: zero or more statements.
block
    : statement*
    ;

// Statements: dispatches to one of the concrete statement kinds.
statement
    : if_statement
    | switch_statement
    | compound_statement
    | expression_statement
    | declaration_statement
    ;

// Compound statement: a brace-delimited, possibly empty, sequence of statements.
compound_statement
    : LBRACE statement_list? RBRACE
    ;

// One or more statements in sequence.
statement_list
    : statement+
    ;

// Expression statement: an expression terminated by a semicolon.
expression_statement
    : expression SEMI
    ;

// If statement: parenthesised condition, a statement, and an optional ELSE branch.
if_statement
    : IF LPAREN expression RPAREN statement (ELSE statement)?
    ;

// Switch Statement
// The controlling expression sits between a matching LPAREN / RPAREN pair
// (the closing token was previously a second LPAREN, which made
// `switch (x) { ... }` unparseable).
switch_statement: SWITCH LPAREN expression RPAREN switch_block;
// Brace-delimited list of labels.
// NOTE(review): only labels are accepted here; no statements may follow a
// label and the BREAK token is not referenced by this rule - confirm whether
// case bodies are meant to be supported.
switch_block: LBRACE switch_label* RBRACE;
// Either `case <expression> :` or `default :`.
switch_label: CASE expression COLON | DEFAULT COLON;

// Declaration statement: a type, a declarator with optional initializer, then a semicolon.
declaration_statement
    : type declarator_init SEMI
    ;

// Declarator optionally followed by `= expression`.
declarator_init
    : declarator (ASSIGN expression)?
    ;

// The declared name.
declarator
    : IDENTIFIER
    ;

// Expression
// Comma-separated list of one or more expressions. The trailing group is
// repeated with `*` so that a single expression, or three and more, are
// accepted (without it the list had to contain exactly two expressions).
expression_list: expression (COMMA expression)*;
// Left-recursive expression rule. ANTLR 4 derives binary-operator precedence
// from the order of the alternatives: an alternative listed earlier binds
// tighter than one listed later.
// NOTE(review): no alternative references equality_operator or
// assignment_operator, so tokens such as EQUAL and NOTEQUAL cannot be consumed
// as binary operators by this rule.
// NOTE(review): the TURNARY alternative is listed above the arithmetic
// operators and carries no <assoc = right> - confirm that this precedence and
// associativity are intended.
expression:
    // Primary expressions.
    name
    | literal
    | LPAREN expression RPAREN
    // Postfix forms: increment/decrement, bracket, parenthesis and brace suffixes.
    | expression (INC | DEC)
    | expression LBRACK expression RBRACK
    | expression LPAREN expression_list? RPAREN
    | expression LBRACE expression_list? RBRACE
    // Prefix (unary) operators.
    | (SUB | ADD | INC | DEC | NOT | BIT_NOT) expression
    // Conditional operator: condition TURNARY expression COLON expression.
    | expression TURNARY expression COLON expression
    // Binary operators, from tightest to loosest binding.
    | expression mul_div_mod_operator expression
    | expression add_sub_operator expression
    // Concatenation is the only alternative marked right-associative.
    | <assoc = right> expression concat_operator expression
    | expression relational_operator expression
    | expression and_operator expression
    | expression or_operator expression
    | expression bitwise_operator expression;

// Operators: thin parser rules that group operator tokens.
concat_operator
    : CONCAT
    ;

and_operator
    : AND
    ;

or_operator
    : OR
    ;

add_sub_operator
    : ADD
    | SUB
    ;

mul_div_mod_operator
    : MUL
    | DIV
    | MOD
    ;

relational_operator
    : GT
    | LT
    | LE
    | GE
    ;

equality_operator
    : EQUAL
    | NOTEQUAL
    ;
// Bitwise operators. The shifts are spelled as two consecutive '<' or '>'
// tokens because the lexer deliberately defines no '<<' / '>>' token.
// The first alternative no longer starts with `|`: a leading bar declared an
// empty alternative, which let this rule match nothing at all.
bitwise_operator:
    '<' '<'
    | '>' '>'
    | BIT_AND
    | BIT_OR
    | BIT_XOR;
// Plain and compound assignment operator tokens.
assignment_operator
    : ASSIGN
    | ADD_ASSIGN
    | SUB_ASSIGN
    | MUL_ASSIGN
    | DIV_ASSIGN
    | AND_ASSIGN
    | OR_ASSIGN
    | XOR_ASSIGN
    | MOD_ASSIGN
    | LSHIFT_ASSIGN
    | RSHIFT_ASSIGN
    ;

// Types: a primitive type, a type followed by `[]`, or a name with an
// optional angle-bracketed, comma-separated list of type arguments.
type
    : primitive_type
    | type LBRACK RBRACK
    | name ('<' type (COMMA type)* '>')?
    ;

// Built-in type keywords.
primitive_type
    : TYPE_BOOLEAN
    | TYPE_CHAR
    | TYPE_FLOAT_DEFAULT
    | TYPE_FLOAT32
    | TYPE_FLOAT64
    | TYPE_INT_DEFAULT
    | TYPE_INT8
    | TYPE_INT16
    | TYPE_INT32
    | TYPE_INT64
    | TYPE_UINT_DEFAULT
    | TYPE_UINT8
    | TYPE_UINT16
    | TYPE_UINT32
    | TYPE_UINT64
    | TYPE_STRING
    ;

// Name: optional namespace prefix, an identifier, then any number of
// dot-separated identifiers.
name
    : namespace? IDENTIFIER (DOT IDENTIFIER)*
    ;

// Namespace: `IDENTIFIER ::` followed by zero or more nested namespaces.
// NOTE(review): because the rule recurses under `*`, a prefix such as
// `a::b::c::` has more than one possible derivation - confirm this is acceptable.
namespace
    : IDENTIFIER COLONCOLON (namespace)*
    ;

// Flags: comma-separated list of one or more identifiers.
flag_list
    : IDENTIFIER (COMMA IDENTIFIER)*
    ;

// Literals: any single literal token produced by the lexer.
literal
    : INTEGER_LITERAL
    | FLOATING_POINT_LITERAL
    | BOOLEAN_LITERAL
    | CHARACTER_LITERAL
    | STRING_LITERAL
    | NULL_LITERAL
    ;

and:

lexer grammar DialogueScriptLexer;

// Keywords
// Primitive type names; widths are given where the grammar author noted them.
TYPE_BOOLEAN       : 'bool' ;
TYPE_CHAR          : 'char' ;     // 16 bits
TYPE_FLOAT_DEFAULT : 'float' ;    // 32 bits
TYPE_FLOAT32       : 'float32' ;
TYPE_FLOAT64       : 'float64' ;
TYPE_INT_DEFAULT   : 'int' ;      // 32 bits
TYPE_INT8          : 'int8' ;
TYPE_INT16         : 'int16' ;
TYPE_INT32         : 'int32' ;
TYPE_INT64         : 'int64' ;
TYPE_UINT_DEFAULT  : 'uint' ;     // 32 bits
TYPE_UINT8         : 'uint8' ;
TYPE_UINT16        : 'uint16' ;
TYPE_UINT32        : 'uint32' ;
TYPE_UINT64        : 'uint64' ;
TYPE_STRING        : 'string' ;   // 16 bits per character

// Control-flow keywords.
BREAK   : 'break' ;     // used for switch
CASE    : 'case' ;      // used for switch
DEFAULT : 'default' ;   // used for switch
IF      : 'if' ;
ELSE    : 'else' ;
SWITCH  : 'switch' ;

// Integer Literals
// An integer in decimal, hexadecimal (0x / 0X), octal (leading 0) or
// binary (0b / 0B) notation.
INTEGER_LITERAL:
    DecIntegerLiteral
    | HexIntegerLiteral
    | OctalIntegerLiteral
    | BinaryIntegerLiteral;

// Decimal: a lone '0', or a non-zero digit followed by any digits.
fragment DecIntegerLiteral: '0' | NonZeroDigit Digit*;
fragment HexIntegerLiteral: '0' [xX] HexDigit+;
// Octal digits are restricted to 0-7. The rule previously used Digit+, which
// accepted 8 and 9, so input such as 08 was lexed as one octal literal.
fragment OctalIntegerLiteral: '0' [0-7]+;
fragment BinaryIntegerLiteral: '0' [bB] BinaryDigit+;

// Floating-Point Literals
FLOATING_POINT_LITERAL: DecFloatingPointLiteral;

// Either digits around a decimal point with an optional type suffix, or an
// integer part followed by a type suffix. The suffix in the second
// alternative is mandatory: when it was optional this rule also matched plain
// integers such as 10, and those were tokenised as INTEGER_LITERAL only
// because that rule happens to be declared first.
fragment DecFloatingPointLiteral:
    DecIntegerLiteral? ('.' Digits) FloatTypeSuffix?
    | DecIntegerLiteral FloatTypeSuffix;

// Boolean literals.
BOOLEAN_LITERAL
    : 'true'
    | 'false'
    ;

// Character literals: one plain character or one escape sequence in single quotes.
CHARACTER_LITERAL
    : '\'' SingleCharacter '\''
    | '\'' EscapeSequence '\''
    ;

// Any character other than a single quote, a backslash, CR or LF.
fragment SingleCharacter
    : ~['\\\r\n]
    ;

// A backslash followed by one of the recognised escape characters.
fragment EscapeSequence
    : '\\\''
    | '\\"'
    | '\\\\'
    | '\\0'
    | '\\a'
    | '\\b'
    | '\\f'
    | '\\n'
    | '\\r'
    | '\\t'
    | '\\v'
    ;

// String literals: double-quoted, possibly empty.
STRING_LITERAL
    : '"' StringCharacters? '"'
    ;

fragment StringCharacters
    : StringCharacter+
    ;

// Any character other than a double quote, a backslash, CR or LF - or an escape sequence.
fragment StringCharacter
    : ~["\\\r\n]
    | EscapeSequence
    ;

// Null literal.
NULL_LITERAL
    : 'null'
    ;

// Separators: brackets and punctuation.
LPAREN     : '(' ;
RPAREN     : ')' ;
LBRACE     : '{' ;
RBRACE     : '}' ;
LBRACK     : '[' ;
RBRACK     : ']' ;
SEMI       : ';' ;
COMMA      : ',' ;
DOT        : '.' ;
COLON      : ':' ;
COLONCOLON : '::' ;

// Operators
// Plain and compound assignment.
ASSIGN        : '=' ;
ADD_ASSIGN    : '+=' ;
SUB_ASSIGN    : '-=' ;
MUL_ASSIGN    : '*=' ;
DIV_ASSIGN    : '/=' ;
AND_ASSIGN    : '&=' ;
OR_ASSIGN     : '|=' ;
XOR_ASSIGN    : '^=' ;
MOD_ASSIGN    : '%=' ;
LSHIFT_ASSIGN : '<<=' ;
RSHIFT_ASSIGN : '>>=' ;

// Comparison and logical negation.
GT       : '>' ;
LT       : '<' ;
EQUAL    : '==' ;
LE       : '<=' ;
GE       : '>=' ;
NOTEQUAL : '!=' ;
NOT      : '!' ;

// Bitwise operators.
BIT_NOT : '~' ;
BIT_AND : '&' ;
BIT_OR  : '|' ;
BIT_XOR : '^' ;
// BIT_SHIFT_L ('<<') and BIT_SHIFT_R ('>>') are intentionally not defined as
// tokens: defining them here makes recognizing scheduled blocks difficult.

// Logical operators.
AND : '&&' ;
OR  : '||' ;

// Increment and decrement.
INC : '++' ;
DEC : '--' ;

// Arithmetic and concatenation.
ADD    : '+' ;
SUB    : '-' ;
MUL    : '*' ;
DIV    : '/' ;
MOD    : '%' ;
CONCAT : '..' ;

// Question mark of the conditional operator.
TURNARY : '?' ;

// Identifiers
// Rule order affects precedence: IDENTIFIER must come after every keyword and
// literal rule so that those rules win when both match the same text.
IDENTIFIER
    : Letter LetterOrDigit*
    ;

// Shared character-class fragments.
fragment LetterOrDigit
    : Letter
    | Digit
    ;

fragment Digits          : Digit+ ;
fragment Digit           : '0' | NonZeroDigit ;
fragment NonZeroDigit    : [1-9] ;
fragment HexDigit        : [0-9a-fA-F] ;
fragment BinaryDigit     : [01] ;
fragment Letter          : [a-zA-Z_] ;
fragment FloatTypeSuffix : [fFdD] ;

// Whitespace and comments.
// Whitespace is discarded; comments are routed to the hidden channel.
WHITESPACE
    : [ \t\r\n\u000C]+ -> skip
    ;

COMMENT_BLOCK
    : '/*' .*? '*/' -> channel(HIDDEN)
    ;

COMMENT_LINE
    : '//' ~[\r\n]* -> channel(HIDDEN)
    ;

Project: https://github.com/Sahasrara/DialogueScript

The grammar is working for the most part, but it's struggling to parse certain types of expressions.

Example:

<<
    if (intVar == 10 && globalFunc() || "string lit" .. "concat string" == stringVar) 
    {
        anotherFunc();
    }
>>

Here's the output tree: (image not included)

I know there's a precedence issue here, but I'm not entirely sure how to resolve it. Would someone mind pointing me in the right direction?


Solution

  • You are not using the equality_operator rule that contains the == operator. Place it somewhere in your expression rule:

    expression
        : ...
        | expression add_sub_operator expression
        | expression equality_operator expression
        | <assoc = right> expression concat_operator expression
        | ...
        ;
    

    When placed there, it will have a lower precedence than the `+` and `-` operators, and a higher precedence than the `..` concatenation operator:

    (image of the resulting parse tree not included)

    Also note that the assignment_operator is not used currently.