Search code examples
pythonparsingyacclex

Variable name starts with "false" and is parsed as constant


I'm trying to write boolean expression parser with variables using sly as a lexer and parser library. I'm trying to define case insensitive constants "true" and "false" but have some issues with variable names which starts from that constants name. For example variable "falseAlarm" is parsed as "false" constant and "Alarm" variable so I get a syntax error. I am not very good at parsers so I really have no idea hot to make it right.

Here is my code:

from sly import Lexer, Parser
from dataclasses import dataclass, field
from typing import List
from pprint import pprint

import re


class Lex(Lexer):
    tokens = {
        LB,
        RB,
        AND,
        OR,
        NOT,
        TRUE,
        FALSE,
        ID,
    }

    ignore = ' \t'
    ignore_newline = r'\n+'

    LB = r'\('
    RB = r'\)'
    AND = r'\&\&'
    OR = r'\|\|'
    NOT = r'(?i)not'
    TRUE = r'(?i)true'
    FALSE = r'(?i)false'
    ID = r'[a-zA-Z][a-zA-Z0-9_]*'


class Pax(Parser):
    debugfile = 'parser.out'

    tokens = Lex.tokens

    @_('boolean_expression boolean_operator boolean_term')
    def boolean_expression(self, p):
        return (p.boolean_operator, [p.boolean_expression, p.boolean_term])

    @_('boolean_term')
    def boolean_expression(self, p):
        return [p.boolean_term]

    @_('AND')
    def boolean_operator(self, p):
        return p.AND

    @_('OR')
    def boolean_operator(self, p):
        return p.OR

    @_('LB boolean_expression RB')
    def boolean_term(self, p):
        return p.boolean_expression

    @_('NOT boolean_term')
    def boolean_term(self, p):
        return ('not', [p.boolean_term])

    @_('boolean_constant')
    def boolean_term(self, p):
        return p.boolean_constant

    @_('ID')
    def boolean_term(self, p):
        return ('variable', p.ID)

    @_('TRUE')
    @_('FALSE')
    def boolean_constant(self, p):
        return ('constant', p)


    def error(self, p):
        if p:
            print(f'Error at token {p.type}, {p.value} at line {p.lineno} col {p.index}')
            self.errok()
        else:
            print('Syntax error at EOF')


TEXT = """
(true || false && true) || falseAlarm
"""

def tokens():
    for t in Lex().tokenize(TEXT):
        print(t)
        yield t

res = Pax().parse(tokens())
print()
pprint(res, indent=4, width=1)


Solution

  • You could change your regex to include word boundaries, i.e. FALSE = r'\bfalse\b'