Search code examples
pythonply

Enforce structure of file with PLY


I am trying to process input file with description of algorithm behavior. I am using python's PLY module for defining lexer and parser. I stumbled upon problem of defining grammar which will enforce user to correctly write this file.

File

# Beginning of the first section
STATES = INITIATOR, IDLE, DONE;
INIT = INITIATOR, IDLE;
TERM = DONE;
# End of first section

# Beginning of the second section
INITIATOR
    RANDOM
    begin
       SEND(x, NEIGHBORS);
       BECOME(DONE);
    end

IDLE
    RECEIVE(x)
    begin
        SEND(x, NEIGHBORS);
        BECOME(DONE);
    end
# End of second section

Lexer

import ply.lex as lex
from soda.helpers import prepare_file


class Lexer(object):
    keywords = (
        'INIT', 'TERM', 'STATES', 'REGISTERS',
        'begin', 'end',
        'SEND', 'BECOME'
    )

    tokens = keywords + (
        'NAME', 'EQUALS', 'COMMA', 'SEMICOLON',
        'LPAREN', 'RPAREN'
    )

    # Tokens

    t_EQUALS = r'='
    t_COMMA = r','
    t_SEMICOLON = r';'
    t_STATES = r'STATES'
    t_REGISTERS = r'REGISTERS'
    t_INIT = r'INIT'
    t_TERM = r'TERM'
    t_begin = r'begin'
    t_end = r'end'
    t_SEND = r'SEND'
    t_BECOME = r'BECOME'
    t_LPAREN = r'\('
    t_RPAREN = r'\)'

    # Ignored characters
    t_ignore = ' \t\n'

    def t_NAME(self, t):
        r'[a-zA-Z][a-zA-Z]*'
        if t.value in self.keywords:  # is this a keyword?
            t.type = t.value
        return t

    def t_error(self, t):
        print ("Illegal character {0} at line {1}".format(t.value[0], t.lineno))
        t.lexer.skip(1)

    def build(self, **kwargs):
        self._lexer = lex.lex(module=self, **kwargs)

    @prepare_file
    def lexical_analysis(self, file):
        print ("Started lexical analysis...")
        for line in file:
            try:
                lex_input = line
            except EOFError:
                break

            self._lexer.input(lex_input)
            while True:
                token = self._lexer.token()
                if not token:
                    break
                print ("  ", token)

Parser

import ply.yacc as yacc
from soda.helpers import prepare_file


class Parser(object):
    def p_algorithm(self, p):
        ''' algorithm : first_section second_section'''

    def p_first_section(self, p):
        ''' first_section : STATES EQUALS states_list SEMICOLON
                          | REGISTERS EQUALS register_list SEMICOLON
                          | INIT EQUALS init_list SEMICOLON
                          | TERM EQUALS term_list SEMICOLON'''

    def p_states_list(self, p):
        ''' states_list  : state_term
                         | states_list COMMA state_term'''

    def p_state_term(self, p):
        ''' state_term : NAME'''
        self.behavior.states.append(p[1])

    def p_register_list(self, p):
        ''' register_list : register_term
                          | register_list COMMA register_term'''

    def p_register_term(self, p):
        ''' register_term : NAME'''
        self.behavior.registers.append(p[1])

    def p_init_list(self, p):
        ''' init_list : init_term
                      | init_list COMMA init_term'''

    def p_init_term(self, p):
        ''' init_term : NAME'''
        self.behavior.init_states.append(p[1])

    def p_term_list(self, p):
        ''' term_list : term_term
                      | term_list COMMA term_term'''

    def p_term_term(self, p):
        ''' term_term : NAME'''
        self.behavior.term_states.append(p[1])

    def p_second_section(self, p):
        ''' second_section : NAME begin commands end'''

    def p_error(self, p):
        print("Syntax error in input! -> {}".format(p))

    def build(self, lexer, behavior):
        self.lexer = lexer
        self.behavior = behavior
        self.tokens = lexer.tokens
        self._parser = yacc.yacc(module=self)

    @prepare_file
    def parsing(self, file):
        for line in file:
            try:
                parser_input = line
                print (line)
            except EOFError:
                break

        self._parser.parse(parser_input, lexer=self.lexer._lexer)

Parsing results in syntax error and I am not sure how to define rules to enforce the consistency of file with algorithm behavior. first_section is parsed ok and problem is second_section. My solution defines that algorithm : first_section second_section and it is not working. I tried to define it like algorithm: first_section | second_section and it works good but this rule states that first and second section can be switched in file.

So my question is how to enforce it with rules so user will keep the input file consistent.

Error output

enter STATES = INITIATOR, IDLE, DONE;

Syntax error in input! -> None
INIT = INITIATOR, IDLE;

Syntax error in input! -> None
TERM = DONE;

Syntax error in input! -> None
INITIATOR

Syntax error in input! -> LexToken(NAME,'INITIATOR',1,0)
begin
Syntax error in input! -> LexToken(begin,'begin',1,0)

Program just states there is error in syntax. Problem is not with lexical analysis but with defined grammar. I can define it in such way that input is accepted but for example user would be able to switch first_section with second_section.

Edit

I think it is not clear from this question what I want to achieve or my problem so I voted to close it. I came up with idea how to better state what I am looking for so I want to raise new question.


Solution

  • Oups! Your grammar parses the file line by line, which is at least uncommon and does not allow to control the ordering of lines. IMHO, you should parse the file as a whole. The trick is to pass the parser a tokenfunc function that will feed the lexer with one line at a time, and declare each section to be composed of lines:

    class Parser(object):
        def p_algorithm(self, p):
            ''' algorithm : first_section second_section'''
    
        def p_first_section(self, p):
            ''' first_section : first_section_line
                              | first_section_line first_section'''
    
        def p_first_section_line(self, p):
            ''' first_section_line : STATES EQUALS states_list SEMICOLON
                              | REGISTERS EQUALS register_list SEMICOLON
                              | INIT EQUALS init_list SEMICOLON
                              | TERM EQUALS term_list SEMICOLON'''
    
        ...
        # same for second section...
    
        @prepare_file
        def parsing(self, file):
            def get_token():
                'a tokenizer that automatically feeds the lexer with the next line'
                while True:
                    tok = self.lexer._lexer.token()
                    if tok is not None: return tok
                    try:
                        line = next(file)
                        self.lexer._lexer.input(line)
                    except StopIteration:
                        return None
    
            self._parser.parse("", lexer=self.lexer._lexer, tokenfunc = get_token)