I'm writing a simplified MODULA-2 grammar using Python PLY.
But I'm getting a syntax error:
$ python3 m2.py
Syntax error at 'MODULE'
and I cannot figure out what the problem with the rules is.
Here is the grammar:
import ply.lex as lex
import ply.yacc as yacc
# =============================================================================
# Lexer rules
# =============================================================================
tokens = (
    # Keywords
    'RETURN', 'IF', 'THEN', 'VAR', 'MODULE', 'BEGIN', 'END',
    # Constants
    'NUMBER',
    # Operators
    'PLUS', 'MINUS', 'TIMES', 'DIV', 'MOD', 'ASSIGN_OP',
    # Separators
    'LPAR', 'RPAR', 'PERIOD', 'COLON', 'SEMICOLON',
    # Identifier
    'IDENT',
)
# Tokens
t_NUMBER = r'\d+'
t_PLUS = r'\+'
t_MINUS = r'-'
t_TIMES = r'\*'
t_LPAR = r'\('
t_RPAR = r'\)'
t_PERIOD = r'\.'
t_COLON = r':'
t_SEMICOLON = r';'
t_ASSIGN_OP = r':='
t_IDENT = r'[a-zA-Z][a-zA-Z0-9]*'
# Ignored characters
t_ignore = ' \t'
def t_error(t):
    print("Illegal character '%s'" % t.value[0])
    t.lexer.skip(1)
# Build the lexer
lexer = lex.lex()
# =============================================================================
# Parser rules
# =============================================================================
precedence = (
('left', 'PLUS', 'MINUS'),
('left', 'TIMES', 'DIV'),
)
def p_add_operator(t):
    """ add_operator : PLUS
                     | MINUS
    """
    pass
def p_mul_operator(t):
    """ mul_operator : TIMES
                     | DIV
                     | MOD
    """
    pass
def p_simple_expression(t):
    """ expression : term
                   | expression add_operator term
    """
    pass
def p_term(t):
    """ term : factor
             | term mul_operator factor
    """
    pass
def p_factor(t):
    """ factor : NUMBER
               | IDENT
               | LPAR expression RPAR
    """
    pass
def p_statement(t):
    """ statement : IDENT
                  | IDENT ASSIGN_OP expression
                  | IF expression THEN statement_sequence END
                  | RETURN expression
    """
    pass
def p_statement_sequence(t):
    """ statement_sequence : statement
                           | statement_sequence SEMICOLON statement
    """
    pass
def p_block(t):
    """ block : declaration_list BEGIN statement_sequence END
    """
    pass
def p_declaration_list(t):
    """ declaration_list : declaration
                         | declaration_list declaration
    """
    pass
def p_declaration(t):
    """ declaration : VAR IDENT COLON IDENT SEMICOLON
    """
    pass
def p_program_module(t):
    """ program_module : MODULE IDENT SEMICOLON block IDENT PERIOD
    """
    pass
def p_error(t):
    print("Syntax error at '%s'" % t.value)
parser = yacc.yacc(start='program_module')
if __name__ == "__main__":
    s = "MODULE test; VAR x: INTEGER; BEGIN x := 10 END test."
    parser.parse(s)
The interesting thing is that the same grammar rules written for lex/yacc are working fine. Can somebody help me with this?
AFAIK, ply.lex does not have enough magic to know that you want the special word MODULE to be lexed as the MODULE token.
With your definition, the simple test:
lexer.input("MODULE test; VAR x: INTEGER; BEGIN x := 10 END test.")
for tok in lexer:
    print(tok)
outputs:
LexToken(IDENT,'MODULE',1,0)
LexToken(IDENT,'test',1,7)
LexToken(SEMICOLON,';',1,11)
LexToken(IDENT,'VAR',1,13)
LexToken(IDENT,'x',1,17)
LexToken(COLON,':',1,18)
LexToken(IDENT,'INTEGER',1,20)
LexToken(SEMICOLON,';',1,27)
LexToken(IDENT,'BEGIN',1,29)
LexToken(IDENT,'x',1,35)
LexToken(ASSIGN_OP,':=',1,37)
LexToken(NUMBER,'10',1,40)
LexToken(IDENT,'END',1,43)
LexToken(IDENT,'test',1,47)
LexToken(PERIOD,'.',1,51)
Because the t_IDENT regex also matches the keywords, every keyword is lexed as a plain IDENT. The correct way to process keywords is to identify them inside the IDENT rule:
# =============================================================================
# Lexer rules
# =============================================================================
# Keywords
keywords = ( 'RETURN', 'IF', 'THEN', 'VAR', 'MODULE', 'BEGIN', 'END' )
tokens = keywords + (
    # Constants
    'NUMBER',
    ...
and
def t_IDENT(t):
    r'[a-zA-Z][a-zA-Z0-9]*'
    if t.value in keywords:  # is this a keyword
        t.type = t.value
    return t
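Note that t.type = t.value only works because each keyword's spelling is identical to its token name. If that ever stops being true, the pattern recommended in the PLY documentation is a reserved-words dictionary; a minimal sketch, assuming the same uppercase keywords and token set as above:
reserved = {
    'RETURN': 'RETURN', 'IF': 'IF', 'THEN': 'THEN', 'VAR': 'VAR',
    'MODULE': 'MODULE', 'BEGIN': 'BEGIN', 'END': 'END',
}
tokens = tuple(reserved.values()) + (
    'NUMBER', 'PLUS', 'MINUS', 'TIMES', 'DIV', 'MOD', 'ASSIGN_OP',
    'LPAR', 'RPAR', 'PERIOD', 'COLON', 'SEMICOLON', 'IDENT',
)

def t_IDENT(t):
    r'[a-zA-Z][a-zA-Z0-9]*'
    # look the lexeme up; fall back to IDENT when it is not a keyword
    t.type = reserved.get(t.value, 'IDENT')
    return t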
The same lexer test now correctly gives:
LexToken(MODULE,'MODULE',1,0)
LexToken(IDENT,'test',1,7)
LexToken(SEMICOLON,';',1,11)
LexToken(VAR,'VAR',1,13)
LexToken(IDENT,'x',1,17)
LexToken(COLON,':',1,18)
LexToken(IDENT,'INTEGER',1,20)
LexToken(SEMICOLON,';',1,27)
LexToken(BEGIN,'BEGIN',1,29)
LexToken(IDENT,'x',1,35)
LexToken(ASSIGN_OP,':=',1,37)
LexToken(NUMBER,'10',1,40)
LexToken(END,'END',1,43)
LexToken(IDENT,'test',1,47)
LexToken(PERIOD,'.',1,51)
and parsing now completes without error.
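One side note: the p_error above will raise an AttributeError if a syntax error occurs at the end of the input, because ply.yacc then calls it with None instead of a token. A slightly more defensive version (a sketch keeping the same message format) is:
def p_error(t):
    if t is None:
        # ply.yacc passes None when the input ends unexpectedly
        print("Syntax error at end of input")
    else:
        print("Syntax error at '%s'" % t.value)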