Tags: python, yacc, lex, ply

Syntax error in Python PLY parser


I'm writing a simplified MODULA-2 grammar using Python PLY.

But I'm getting a syntax error:

$ python3 m2.py
Syntax error at 'MODULE'

and I cannot figure out what the problem with the rules is.

Here is the grammar:

import ply.lex as lex
import ply.yacc as yacc

# =============================================================================
# Lexer rules
# =============================================================================

tokens = (
    # Keywords
    'RETURN', 'IF', 'THEN', 'VAR', 'MODULE', 'BEGIN', 'END',
    # Constants
    'NUMBER',
    # Operators
    'PLUS', 'MINUS', 'TIMES', 'DIV', 'MOD', 'ASSIGN_OP',
    # Separators
    'LPAR', 'RPAR', 'PERIOD', 'COLON', 'SEMICOLON',
    # Identifier
    'IDENT',
    )

# Tokens

t_NUMBER        = r'\d+'
t_PLUS          = r'\+'
t_MINUS         = r'-'
t_TIMES         = r'\*'
t_LPAR          = r'\('
t_RPAR          = r'\)'
t_PERIOD        = r'\.'
t_COLON         = r':'
t_SEMICOLON     = r';'
t_ASSIGN_OP     = r':='
t_IDENT         = r'[a-zA-Z][a-zA-Z0-9]*'

# Ignored characters
t_ignore = ' \t'

def t_error(t):
    print("Illegal character '%s'" % t.value[0])
    t.lexer.skip(1)

# Build the lexer
lexer = lex.lex()


# =============================================================================
# Parser rules
# =============================================================================

precedence = (
    ('left', 'PLUS', 'MINUS'),
    ('left', 'TIMES', 'DIV'),
)

def p_add_operator(t):
    """ add_operator : PLUS
                     | MINUS
    """
    pass

def p_mul_operator(t):
    """ mul_operator : TIMES
                     | DIV
                     | MOD
    """
    pass

def p_simple_expression(t):
    """ expression : term
                   | expression add_operator term
    """
    pass

def p_term(t):
    """ term : factor
             | term mul_operator factor
    """
    pass

def p_factor(t):
    """ factor : NUMBER
               | IDENT
               | LPAR expression RPAR
    """
    pass

def p_statement(t):
    """ statement : IDENT
                  | IDENT ASSIGN_OP expression
                  | IF expression THEN statement_sequence END
                  | RETURN expression
    """
    pass

def p_statement_sequence(t):
    """ statement_sequence : statement
                           | statement_sequence SEMICOLON statement
    """
    pass

def p_block(t):
    """ block : declaration_list BEGIN statement_sequence END
    """
    pass

def p_declaration_list(t):
    """ declaration_list : declaration
                         | declaration_list declaration
    """
    pass

def p_declaration(t):
    """ declaration : VAR IDENT COLON IDENT SEMICOLON
    """
    pass

def p_program_module(t):
    """ program_module : MODULE IDENT SEMICOLON block IDENT PERIOD
    """
    pass

def p_error(t):
    print("Syntax error at '%s'" % t.value)

parser = yacc.yacc(start='program_module')

if __name__ == "__main__":
    s = "MODULE test; VAR x: INTEGER; BEGIN x := 10 END test."
    parser.parse(s)

The interesting thing is that the same grammar rules written for lex/yacc work fine. Can somebody help me with this?


Solution

  • AFAIK, ply.lex does not have enough magic to know that you want the special word MODULE to become the MODULE token; your t_IDENT pattern simply matches it as an ordinary identifier.

    With your definition, the simple test:

    lexer.input("MODULE test; VAR x: INTEGER; BEGIN x := 10 END test.")
    for tok in lexer:
        print(tok)
    

    outputs:

    LexToken(IDENT,'MODULE',1,0)
    LexToken(IDENT,'test',1,7)
    LexToken(SEMICOLON,';',1,11)
    LexToken(IDENT,'VAR',1,13)
    LexToken(IDENT,'x',1,17)
    LexToken(COLON,':',1,18)
    LexToken(IDENT,'INTEGER',1,20)
    LexToken(SEMICOLON,';',1,27)
    LexToken(IDENT,'BEGIN',1,29)
    LexToken(IDENT,'x',1,35)
    LexToken(ASSIGN_OP,':=',1,37)
    LexToken(NUMBER,'10',1,40)
    LexToken(IDENT,'END',1,43)
    LexToken(IDENT,'test',1,47)
    LexToken(PERIOD,'.',1,51)
    

    The correct way to process keywords is to identify them inside the IDENT token:

    # =============================================================================
    # Lexer rules
    # =============================================================================
    # Keywords
    keywords = ( 'RETURN', 'IF', 'THEN', 'VAR', 'MODULE', 'BEGIN', 'END' )
    tokens = keywords + (
        # Constants
        'NUMBER',
        ...
    

    and

    def t_IDENT(t):
        r'[a-zA-Z][a-zA-Z0-9]*'
        if t.value in keywords:  # is this a keyword
            t.type = t.value
        return t
    

    The same lexer check now correctly gives:

    LexToken(MODULE,'MODULE',1,0)
    LexToken(IDENT,'test',1,7)
    LexToken(SEMICOLON,';',1,11)
    LexToken(VAR,'VAR',1,13)
    LexToken(IDENT,'x',1,17)
    LexToken(COLON,':',1,18)
    LexToken(IDENT,'INTEGER',1,20)
    LexToken(SEMICOLON,';',1,27)
    LexToken(BEGIN,'BEGIN',1,29)
    LexToken(IDENT,'x',1,35)
    LexToken(ASSIGN_OP,':=',1,37)
    LexToken(NUMBER,'10',1,40)
    LexToken(END,'END',1,43)
    LexToken(IDENT,'test',1,47)
    LexToken(PERIOD,'.',1,51)
    

    and parsing now completes with no error.
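
    As a variant, the PLY manual suggests keeping the reserved words in a dictionary that maps each spelling to its token type. A minimal sketch of that approach (only the keyword handling changes; the remaining t_* rules and the grammar stay as in the question):

    # Reserved words: source spelling -> token type.
    # Both happen to be identical here, since MODULA-2 keywords are uppercase.
    reserved = {
        'RETURN': 'RETURN', 'IF': 'IF', 'THEN': 'THEN', 'VAR': 'VAR',
        'MODULE': 'MODULE', 'BEGIN': 'BEGIN', 'END': 'END',
    }

    tokens = tuple(reserved.values()) + (
        # Constants
        'NUMBER',
        # Operators
        'PLUS', 'MINUS', 'TIMES', 'DIV', 'MOD', 'ASSIGN_OP',
        # Separators
        'LPAR', 'RPAR', 'PERIOD', 'COLON', 'SEMICOLON',
        # Identifier
        'IDENT',
    )

    def t_IDENT(t):
        r'[a-zA-Z][a-zA-Z0-9]*'
        # Reclassify the identifier if it is a reserved word.
        t.type = reserved.get(t.value, 'IDENT')
        return t

    This is equivalent to the tuple membership test above; the dictionary form only pays off when the keyword spelling differs from the token name (for example, lowercase keywords).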