Search code examples
python · regex · compiler-construction

Tiny Language compiler using python and regex


Hello Stack Overflow users, I hope you are having a good day. I'm building this tiny-language compiler for my homework and tried using regex, but the output is weird. First of all, I get an identifier called 't' which never appears in my input, and the lexer doesn't separate the identifier 'x' from the semicolon. Thanks in advance for your help.

Here is my input

read x;   {input an integer }
     if  0 < x   then     {  don’t compute if x <= 0 }
        fact  := 1;
        repeat 
           fact  := fact *  x;
            x  := x  -  1 
        until  x  =  0;
        write  fact   {  output  factorial of x }
     end 

And that's my code using regex

 # -*- coding: utf-8 -*-
"""
Created on Wed May 13 04:11:06 2020

@author: PC
"""

class OwnCompiler (object):
    """Regex-based token printer for the TINY language (question's code).

    Reads a source file line by line and writes a two-column
    "Type        Token" listing to output.txt.

    NOTE(review): the whole approach matches entire lines against
    unanchored greedy regexes, which is the root cause of the odd output
    shown below the code — see the notes on the individual patterns.
    """
    def __init__ (self,file):
        import re
        # All lines of the source program, comments included.
        self.file=open(file,"r").readlines()
        # One- and two-character symbols mapped to their token names.
        self.symbols = {
                "+":"PLUS_OP",
                "-":"MINUS_OP",
                "*":"MUL_OP",
                "/":"DIV_OP",
                "=":"EQUAL_OP",
                "<":"LESS_OP",
                ">":"GREATER_OP",
                "(":"LEFT_PARENTHESIS",
                ")":"RIGHT_PARENTHESIS",
                ":=":"ASSIGN",
                ";":"SEMICOLON",
                }
        # Captures a {...} comment appearing anywhere in the line.
        self.commentPattern = re.compile(r".*({\n*\s*.*\s*})")
        # Reserved word, then the rest of the line in group(2).
        # NOTE(review): because (.*) is greedy, the trailing (then)*
        # can never capture anything — group(3) is effectively dead.
        self.reservePattern = re.compile(r"\s*(read|write|if|then|else|end|repeat|until)+\s*(.*)(then)*")
        # Captures the LAST symbol character on the line (leading .* is greedy).
        self.symbolPattern = re.compile(r".*(\+|\*|-|/|=|<|>|\(|\)|;)")
        # Assignment statement: identifier := expression.
        # NOTE(review): the leading greedy .* swallows all but the final
        # character of the identifier, so group(1) is a single letter —
        # this is why the output shows the spurious IDENTIFIER 't'
        # (the tail of "fact").
        self.identifierSymbol = re.compile(r".*(\w+)\s+(:=)\s+(.*)")

    def compileOutput(self):
        """Scan every input line and write the token listing to output.txt."""
        self.fileWrite=open("output.txt","w")
        self.fileWrite.write("Type        Token\n==================\n")
        for i in self.file :
            print(i)
            # Each extractor writes its own rows, or silently skips the
            # line when its pattern does not match.
            self.getComment(i)
            self.getReserveWord(i)
            self.getIdentify(i)
        self.fileWrite.close()#end
    def getComment(self,text):
        """Write a COMMENT row if the line contains a {...} comment."""
        try:
            self.fileWrite.write("COMMENT        "+self.commentPattern.match(text).group(1)+"\n")
        except:
            # NOTE(review): a bare except also hides unrelated errors,
            # not only the AttributeError of a failed match.
            print("NO_COMMENT")
    def getReserveWord(self,text):
        """Write a RESERVE_WORD row and lex the remainder of the line."""
        self.Compiled = self.reservePattern.match(text)
        try:
            self.fileWrite.write("RESERVE_WORD        "+self.Compiled.group(1)+"\n")
            # Hand everything after the keyword to the symbol matcher.
            self.getSymbols(self.Compiled.group(2))
            try:
                # group(3) is the dead (then)* capture — see __init__.
                self.fileWrite.write("RESERVE_WORD        "+self.Compiled.group(3)+"\n")
            except:
                print("NO_RESERVE_WORD2")
        except:
            print("NO_RESERVE_WORD")
    def getSymbols(self,text):
        """Look up the matched span in the symbol table and write one row."""
        self.Compiled= self.symbolPattern.match(text)
        # NOTE(review): group() with no argument is the ENTIRE match
        # (e.g. "x;"), not the captured symbol in group(1), so the dict
        # lookup almost always misses — hence the UNKNOWN rows in the
        # sample output, and "x;" never being split.
        self.GOT_TOKEN= self.getTokensSymbols(self.Compiled.group())
        try:
            self.fileWrite.write(self.GOT_TOKEN+"        "+self.Compiled.group()+"\n")
        except:
            print("NO_SYMBOLS")
    def getIdentify(self,text):
        """Write IDENTIFIER / NUMBER / WORD rows for an assignment line."""
        self.Compiled = self.identifierSymbol.match(text)
        try:
            self.fileWrite.write("IDENTIFIER        "+self.Compiled.group(1)+"\n")
            self.getSymbols(text)
            # NOTE(review): this iterates group(3) character by character
            # but writes the WHOLE of group(3) once per non-space
            # character — presumably a per-token loop was intended.
            for i in self.Compiled.group(3):
                if i ==" " :
                    continue
                if self.isNumber(i):
                    self.fileWrite.write("NUMBER        ")
                else:
                    self.fileWrite.write("WORD        ")
                self.fileWrite.write(self.Compiled.group(3)+"\n")
        except:
            print("NO_IDENTIFIRES")
    def getTokensSymbols(self,symbol):
        """Map a symbol string to its token name, or "UNKNOWN" if absent."""
        try: 
            return self.symbols[symbol]
        except:
            print("NOT_DEFINED_IN_SYMBOL_DICT")
            return "UNKNOWN"

    def isNumber(self,text):
         """Return True if *text* parses as an int, else False."""
         try:
             int(text)
             return True
         except:
             return False

if __name__ == "__main__":
    # Tokenize input.txt and emit the report to output.txt.
    compiler = OwnCompiler("input.txt")
    compiler.compileOutput()

And here is my output

Type        Token
==================
COMMENT        { Sample program in TINY language – computes factorial }
COMMENT        {input an integer }
RESERVE_WORD        read
UNKNOWN        x;
COMMENT        {  don’t compute if x <= 0 }
RESERVE_WORD        if
UNKNOWN        0 < x   then     {  don’t compute if x <=
IDENTIFIER        t
UNKNOWN                fact  := 1;
RESERVE_WORD        repeat
IDENTIFIER        t
UNKNOWN                   fact  := fact *  x;
IDENTIFIER        x
UNKNOWN                    x  := x  -
RESERVE_WORD        until
UNKNOWN        x  =  0;
COMMENT        {  output  factorial of x }
RESERVE_WORD        write
RESERVE_WORD        end

Solution

  • If you are going to parse a language you need a 'lexer' that will return individual tokens ignoring whitespace and comments. Along these lines, just as an example:

    import re, collections
    
    class Lexer(object):
        """Regex-based lexer for the TINY language.

        The whole file is read once; tokens are then produced lazily as
        ``(type, value)`` named tuples. WHITESPACE and COMMENT matches
        are consumed but never yielded, any text the master regex cannot
        match is yielded as an ERROR token, and a final EOF sentinel
        marks the end of input.
        """

        WHITESPACE = r'(?P<WHITESPACE>\s+)'
        COMMENT = r'(?P<COMMENT>{[^}]*})'
        READ = r'(?P<READ>\bread\b)'
        WRITE = r'(?P<WRITE>\bwrite\b)'
        IF = r'(?P<IF>\bif\b)'
        THEN = r'(?P<THEN>\bthen\b)'
        ELSE = r'(?P<ELSE>\belse\b)'
        END = r'(?P<END>\bend\b)'
        REPEAT = r'(?P<REPEAT>\brepeat\b)'
        UNTIL = r'(?P<UNTIL>\buntil\b)'
        OPERATOR = r'(?P<OPERATOR>(?:[+*/=<>-]|:=))'
        LPAREN = r'(?P<LPAREN>\()'
        RPAREN = r'(?P<RPAREN>\))'
        IDENTIFIER = r'(?P<IDENTIFIER>[a-z]+)'
        INTEGER = r'(?P<INTEGER>\d+)'
        SEMICOLON = r'(?P<SEMICOLON>;)'

        # One master pattern; the alternative that matched is recovered
        # via m.lastgroup. Keyword patterns must precede IDENTIFIER so
        # that e.g. "read" is not lexed as an identifier.
        regex = re.compile('|'.join([
            WHITESPACE,
            COMMENT,
            READ,
            WRITE,
            IF,
            THEN,
            ELSE,
            END,
            REPEAT,
            UNTIL,
            OPERATOR,
            LPAREN,
            RPAREN,
            IDENTIFIER,
            INTEGER,
            SEMICOLON
            ]))

        def __init__ (self, file):
            """Read *file* completely and set up the lazy token generator."""

            def generate_tokens(text):
                Token = collections.namedtuple('Token', ['type','value'])
                scanner = Lexer.regex.finditer(text)
                last_end = 0
                for m in scanner:
                    start = m.start()
                    end = m.end()
                    if start != last_end:
                        # A gap between consecutive matches means the
                        # input contained text no alternative could
                        # match: report it as an ERROR token.
                        # BUG FIX: the original sliced self.text, an
                        # attribute that was never assigned, so hitting
                        # this branch raised AttributeError. Slice the
                        # local parameter instead; a fresh name also
                        # avoids shadowing `text` for later iterations.
                        bad = text[last_end:start]
                        yield Token('ERROR', bad)
                    last_end = end
                    token = Token(m.lastgroup, m.group())
                    if token.type != 'WHITESPACE' and token.type != 'COMMENT':
                        yield token
                # Explicit sentinel so callers can stop without having
                # to catch StopIteration.
                yield Token('EOF', '<end-of-file>')

            with open(file, "r") as f:
                text = f.read()
                self._token_generator = generate_tokens(text)

        def next_token(self):
            """Return the next token; calling past EOF raises StopIteration."""
            return self._token_generator.__next__()
    
    
    # Drive the lexer to exhaustion, printing each token as it arrives.
    lexer = Lexer('input.txt')
    token = lexer.next_token()
    while token.type != 'EOF':
        print(token)
        token = lexer.next_token()
    print(token)  # the EOF sentinel is printed too, then we stop
    

    Prints:

    Token(type='READ', value='read')
    Token(type='IDENTIFIER', value='x')
    Token(type='SEMICOLON', value=';')
    Token(type='IF', value='if')
    Token(type='INTEGER', value='0')
    Token(type='OPERATOR', value='<')
    Token(type='IDENTIFIER', value='x')
    Token(type='THEN', value='then')
    Token(type='IDENTIFIER', value='fact')
    Token(type='OPERATOR', value=':=')
    Token(type='INTEGER', value='1')
    Token(type='SEMICOLON', value=';')
    Token(type='REPEAT', value='repeat')
    Token(type='IDENTIFIER', value='fact')
    Token(type='OPERATOR', value=':=')
    Token(type='IDENTIFIER', value='fact')
    Token(type='OPERATOR', value='*')
    Token(type='IDENTIFIER', value='x')
    Token(type='SEMICOLON', value=';')
    Token(type='IDENTIFIER', value='x')
    Token(type='OPERATOR', value=':=')
    Token(type='IDENTIFIER', value='x')
    Token(type='OPERATOR', value='-')
    Token(type='INTEGER', value='1')
    Token(type='UNTIL', value='until')
    Token(type='IDENTIFIER', value='x')
    Token(type='OPERATOR', value='=')
    Token(type='INTEGER', value='0')
    Token(type='SEMICOLON', value=';')
    Token(type='WRITE', value='write')
    Token(type='IDENTIFIER', value='fact')
    Token(type='END', value='end')
    Token(type='EOF', value='<end-of-file>')