Hello Stack Overflow users, I hope you're having a good day. I'm writing a compiler for the TINY language as homework. I tried using regex, but the output is weird. First of all, I get an identifier called 't' that never appears in my input, and the identifier 'x' is not separated from the semicolon. Thanks in advance for your help.
Here is my input:
{ Sample program in TINY language – computes factorial }
read x; {input an integer }
if 0 < x then { don’t compute if x <= 0 }
  fact := 1;
  repeat
    fact := fact * x;
    x := x - 1
  until x = 0;
  write fact { output factorial of x }
end
And here is my code using regex:
# -*- coding: utf-8 -*-
"""
Created on Wed May 13 04:11:06 2020
@author: PC
"""
import re


class OwnCompiler(object):
    def __init__(self, file):
        self.file = open(file, "r").readlines()
        self.symbols = {
            "+": "PLUS_OP",
            "-": "MINUS_OP",
            "*": "MUL_OP",
            "/": "DIV_OP",
            "=": "EQUAL_OP",
            "<": "LESS_OP",
            ">": "GREATER_OP",
            "(": "LEFT_PARENTHESIS",
            ")": "RIGHT_PARENTHESIS",
            ":=": "ASSIGN",
            ";": "SEMICOLON",
        }
        self.commentPattern = re.compile(r".*({\n*\s*.*\s*})")
        self.reservePattern = re.compile(r"\s*(read|write|if|then|else|end|repeat|until)+\s*(.*)(then)*")
        self.symbolPattern = re.compile(r".*(\+|\*|-|/|=|<|>|\(|\)|;)")
        self.identifierSymbol = re.compile(r".*(\w+)\s+(:=)\s+(.*)")

    def compileOutput(self):
        self.fileWrite = open("output.txt", "w")
        self.fileWrite.write("Type Token\n==================\n")
        for i in self.file:
            print(i)
            self.getComment(i)
            self.getReserveWord(i)
            self.getIdentify(i)
        self.fileWrite.close()  # end

    def getComment(self, text):
        try:
            self.fileWrite.write("COMMENT " + self.commentPattern.match(text).group(1) + "\n")
        except:
            print("NO_COMMENT")

    def getReserveWord(self, text):
        self.Compiled = self.reservePattern.match(text)
        try:
            self.fileWrite.write("RESERVE_WORD " + self.Compiled.group(1) + "\n")
            self.getSymbols(self.Compiled.group(2))
            try:
                self.fileWrite.write("RESERVE_WORD " + self.Compiled.group(3) + "\n")
            except:
                print("NO_RESERVE_WORD2")
        except:
            print("NO_RESERVE_WORD")

    def getSymbols(self, text):
        self.Compiled = self.symbolPattern.match(text)
        self.GOT_TOKEN = self.getTokensSymbols(self.Compiled.group())
        try:
            self.fileWrite.write(self.GOT_TOKEN + " " + self.Compiled.group() + "\n")
        except:
            print("NO_SYMBOLS")

    def getIdentify(self, text):
        self.Compiled = self.identifierSymbol.match(text)
        try:
            self.fileWrite.write("IDENTIFIER " + self.Compiled.group(1) + "\n")
            self.getSymbols(text)
            for i in self.Compiled.group(3):
                if i == " ":
                    continue
                if self.isNumber(i):
                    self.fileWrite.write("NUMBER ")
                else:
                    self.fileWrite.write("WORD ")
            self.fileWrite.write(self.Compiled.group(3) + "\n")
        except:
            print("NO_IDENTIFIERS")

    def getTokensSymbols(self, symbol):
        try:
            return self.symbols[symbol]
        except:
            print("NOT_DEFINED_IN_SYMBOL_DICT")
            return "UNKNOWN"

    def isNumber(self, text):
        try:
            int(text)
            return True
        except:
            return False


if __name__ == "__main__":
    instance = OwnCompiler("input.txt")
    instance.compileOutput()
And here is my output:
Type Token
==================
COMMENT { Sample program in TINY language – computes factorial }
COMMENT {input an integer }
RESERVE_WORD read
UNKNOWN x;
COMMENT { don’t compute if x <= 0 }
RESERVE_WORD if
UNKNOWN 0 < x then { don’t compute if x <=
IDENTIFIER t
UNKNOWN fact := 1;
RESERVE_WORD repeat
IDENTIFIER t
UNKNOWN fact := fact * x;
IDENTIFIER x
UNKNOWN x := x -
RESERVE_WORD until
UNKNOWN x = 0;
COMMENT { output factorial of x }
RESERVE_WORD write
RESERVE_WORD end
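The stray IDENTIFIER 't' and the unsplit 'x;' have the same root cause: every one of your patterns starts with an unanchored, greedy .* wildcard. In .*(\w+)\s+(:=)\s+(.*) the leading .* consumes as much of the line as it can and backs off only one character so that (\w+) can still match, so the group captures just the final letter of "fact". Likewise, in .*(\+|\*|-|/|=|<|>|\(|\)|;) the group captures only the last symbol on the line, and the .group() you look up in your symbol table is the whole match (for example "x;"), which is never a key in the dictionary. A quick demonstration of the first pattern (a standalone sketch using the pattern from your question, not your full program):

import re

# the identifier pattern from the question
identifier = re.compile(r".*(\w+)\s+(:=)\s+(.*)")

m = identifier.match("fact := 1;")
# the greedy .* eats "fac", so the (\w+) group is left with just "t"
print(m.groups())  # ('t', ':=', '1;')

You can keep patching these patterns, but matching whole lines against regexes like this will keep fighting you.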
If you are going to parse a language, you need a lexer that returns individual tokens while ignoring whitespace and comments. Along these lines, just as an example:
import re, collections


class Lexer(object):
    WHITESPACE = r'(?P<WHITESPACE>\s+)'
    COMMENT = r'(?P<COMMENT>{[^}]*})'
    READ = r'(?P<READ>\bread\b)'
    WRITE = r'(?P<WRITE>\bwrite\b)'
    IF = r'(?P<IF>\bif\b)'
    THEN = r'(?P<THEN>\bthen\b)'
    ELSE = r'(?P<ELSE>\belse\b)'
    END = r'(?P<END>\bend\b)'
    REPEAT = r'(?P<REPEAT>\brepeat\b)'
    UNTIL = r'(?P<UNTIL>\buntil\b)'
    OPERATOR = r'(?P<OPERATOR>(?:[+*/=<>-]|:=))'
    LPAREN = r'(?P<LPAREN>\()'
    RPAREN = r'(?P<RPAREN>\))'
    IDENTIFIER = r'(?P<IDENTIFIER>[a-z]+)'
    INTEGER = r'(?P<INTEGER>\d+)'
    SEMICOLON = r'(?P<SEMICOLON>;)'

    regex = re.compile('|'.join([
        WHITESPACE,
        COMMENT,
        READ,
        WRITE,
        IF,
        THEN,
        ELSE,
        END,
        REPEAT,
        UNTIL,
        OPERATOR,
        LPAREN,
        RPAREN,
        IDENTIFIER,
        INTEGER,
        SEMICOLON
    ]))
    def __init__(self, file):
        def generate_tokens(text):
            Token = collections.namedtuple('Token', ['type', 'value'])
            scanner = Lexer.regex.finditer(text)
            last_end = 0
            for m in scanner:
                start = m.start()
                end = m.end()
                if start != last_end:
                    # we skipped over text to reach this token, which means
                    # there was unrecognizable input: emit an "error token"
                    error_text = text[last_end:start]
                    yield Token('ERROR', error_text)
                last_end = end
                token = Token(m.lastgroup, m.group())
                if token.type != 'WHITESPACE' and token.type != 'COMMENT':
                    yield token
            yield Token('EOF', '<end-of-file>')

        with open(file, "r") as f:
            text = f.read()
        self._token_generator = generate_tokens(text)

    def next_token(self):
        # calling this again after the 'EOF' token raises StopIteration
        return next(self._token_generator)
lexer = Lexer('input.txt')
while True:
    token = lexer.next_token()
    print(token)
    if token.type == 'EOF':
        break
Prints:
Token(type='READ', value='read')
Token(type='IDENTIFIER', value='x')
Token(type='SEMICOLON', value=';')
Token(type='IF', value='if')
Token(type='INTEGER', value='0')
Token(type='OPERATOR', value='<')
Token(type='IDENTIFIER', value='x')
Token(type='THEN', value='then')
Token(type='IDENTIFIER', value='fact')
Token(type='OPERATOR', value=':=')
Token(type='INTEGER', value='1')
Token(type='SEMICOLON', value=';')
Token(type='REPEAT', value='repeat')
Token(type='IDENTIFIER', value='fact')
Token(type='OPERATOR', value=':=')
Token(type='IDENTIFIER', value='fact')
Token(type='OPERATOR', value='*')
Token(type='IDENTIFIER', value='x')
Token(type='SEMICOLON', value=';')
Token(type='IDENTIFIER', value='x')
Token(type='OPERATOR', value=':=')
Token(type='IDENTIFIER', value='x')
Token(type='OPERATOR', value='-')
Token(type='INTEGER', value='1')
Token(type='UNTIL', value='until')
Token(type='IDENTIFIER', value='x')
Token(type='OPERATOR', value='=')
Token(type='INTEGER', value='0')
Token(type='SEMICOLON', value=';')
Token(type='WRITE', value='write')
Token(type='IDENTIFIER', value='fact')
Token(type='END', value='end')
Token(type='EOF', value='<end-of-file>')
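One design detail worth calling out: Python's re module tries the alternatives of '|' left to right at each position, so the keyword patterns must come before IDENTIFIER in the join or every keyword would come back as an identifier, and the \b word boundaries keep a keyword from matching inside a longer name. A quick check (a hypothetical standalone snippet, separate from the Lexer class above):

import re

keyword_first = re.compile(r'(?P<READ>\bread\b)|(?P<IDENTIFIER>[a-z]+)')
identifier_first = re.compile(r'(?P<IDENTIFIER>[a-z]+)|(?P<READ>\bread\b)')

print(keyword_first.match('read').lastgroup)     # READ
print(identifier_first.match('read').lastgroup)  # IDENTIFIER
print(keyword_first.match('readx').lastgroup)    # IDENTIFIER, thanks to \b

The same lastgroup attribute is what the lexer above uses to turn each match into a token type.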