I have this simplified version of a compiler code I've been making, and a strange occurrence has got me confused. Here is the simplified code:
# Character classes recognised by the tokenizer.
# LETTER is built from a single string so that a forgotten comma between two
# adjacent quoted letters can no longer fuse them into one two-character entry
# (the original list contained 'MN' instead of 'M' and 'N').
LETTER = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
WHITESPACE = [' ', '\n', '\r', '\v', '\t', '\f']
SYMBOL = [';', ':', ',', '[', ']', '{', '}', '(', ')', '+', '-', '*', '=', '<']
# Module-level scanner state used by get_token():
#   line_num   - 1-based number of the line currently being read
#   input_file - the source text, opened read-only in text mode
line_num = 1
input_file = open('input.txt', 'r')
def get_token():
    """Read the next token from ``input_file``.

    Returns ``None`` at end of file, otherwise a ``(kind, text, line)`` tuple
    where ``kind`` is ``"whitespace"``, ``'id'``, ``'symbol'`` or ``'invalid'``.
    ``line_num`` is incremented whenever a newline character is consumed; a
    newline token itself is reported on the line it terminates.
    """
    global line_num

    first = input_file.read(1)

    if first in WHITESPACE:
        token_line = line_num
        if first == '\n':
            line_num += 1
        return "whitespace", first, token_line

    if first in LETTER:
        lexeme = first
        while True:
            following = input_file.read(1)
            if following == '':
                # End of file terminates the identifier.
                return 'id', lexeme, line_num
            if following in LETTER:
                lexeme += following
                continue
            if following in SYMBOL or following in WHITESPACE:
                # Push the delimiter back so the next call sees it.
                # NOTE(review): in text mode tell() returns an opaque value,
                # so subtracting 1 from it is not a reliable "one character
                # back" -- this is the line the question is about.
                input_file.seek(input_file.tell() - 1)
                return 'id', lexeme, line_num
            # Any other character makes the whole lexeme invalid.
            return 'invalid', lexeme + following, line_num

    if first in SYMBOL:
        return 'symbol', first, line_num

    if first == '':
        return None

    return 'invalid', first, line_num
# Print every token as "(line, kind, text)" until get_token() signals end of
# file by returning None, then release the file handle.
for token in iter(get_token, None):
    print(f"({token[2]}, {token[0]}, {token[1]})")
input_file.close()
this is the content of the input file:
prod;
}
and this is the output I'm trying to achieve:
(1, id, prod)
(1, symbol, ;)
(1, whitespace,
)
(2, whitespace,
)
(3, symbol, })
but instead, this is the output I get:
(1, id, prod)
(1, whitespace,
)
(2, whitespace,
)
(3, whitespace,
)
(4, symbol, })
After checking the values of the variables and the file pointer at each stage, I came to the conclusion that the program really is recognizing the character ';' as a newline, and the same happens with any other SYMBOL I put in its place. What's even more confusing is that if I write the first line as 'prod ;' or 'prod;;' or 'prod; ' it gives the correct output. How is that happening?
You seem to be working under unstated constraints (e.g. "the thing is I'm not allowed to use any imported libraries", from a comment).
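Your seeking does not work the way you wrote it: for a file opened in text mode, tell() returns an opaque number rather than a character index, so seek(tell() - 1) is not guaranteed to step back exactly one character. Instead of using tell, keep track of the position yourself: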
# Count of get_char_and_pos() calls so far; doubles as the position of the
# next character to be read, so no tell() is needed.
pos = 0


def get_char_and_pos(f):
    """Read one character from ``f`` and report where it was read.

    Returns a ``(char, position)`` tuple: ``char`` is the result of
    ``f.read(1)`` (the empty string at end of file) and ``position`` is the
    value the global ``pos`` had before this call. ``pos`` is advanced by one
    on every call, including calls that hit end of file.
    """
    global pos
    char_pos = pos
    pos = char_pos + 1
    return f.read(1), char_pos
and replace all instances of input_file.read(1)
by a call to your position tracking function:
# Character classes used by the tokenizer. They are sets rather than lists
# because membership tests ("x in SYMBOL") are faster on a set.
LETTER = set(
    "abcdefghijklmnopqrstuvwxyz"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
)
WHITESPACE = set(" \n\r\v\t\f")
SYMBOL = set(";:,[]{}()+-*=<")
# Create the sample input used by the demo: "prod;", an empty line, then "}".
with open("input.txt", "w") as f:
    f.write("prod;\n\n}")

# Scanner state shared with get_token(): the current 1-based line number and
# the handle the characters are read from.
line_num = 1
input_file = open('input.txt', 'r+')
def get_token():
    """Read the next token from ``input_file``.

    Characters are fetched through ``get_char_and_pos`` so that the position
    of every character is known without calling ``tell()``.

    Returns:
        ``None`` at end of file, otherwise a ``(kind, text, line)`` tuple where
        ``kind`` is ``"whitespace"``, ``'id'``, ``'symbol'`` or ``'invalid'``.
        A newline token is reported on the line it terminates; ``line_num`` is
        incremented after it has been consumed.
    """
    global line_num, pos

    value, value_pos = get_char_and_pos(input_file)  # use function
    if value in WHITESPACE:
        start = line_num
        if value == '\n':
            line_num += 1
        return "whitespace", value, start
    elif value in LETTER:
        input_char, value_pos = get_char_and_pos(input_file)  # use function
        while input_char != '':
            if input_char in LETTER:
                value += input_char
            elif input_char in SYMBOL or input_char in WHITESPACE:
                # The delimiter belongs to the next token: jump back to it.
                input_file.seek(value_pos)
                # Rewind the counter as well. Without this, ``pos`` runs one
                # ahead of the file after every push-back, and a later seek
                # would skip characters (e.g. the last ';' of "ab;cd;").
                pos = value_pos
                # NOTE(review): value_pos is a character count. It presumably
                # matches the file offset only while every character read so
                # far is stored as a single byte -- confirm for non-ASCII
                # input or files with "\r\n" line endings.
                return 'id', value, line_num
            else:
                # Any other character makes the whole lexeme invalid.
                value += input_char
                return 'invalid', value, line_num
            input_char, value_pos = get_char_and_pos(input_file)  # use function
        # End of file terminates the identifier.
        return 'id', value, line_num
    elif value in SYMBOL:
        return 'symbol', value, line_num
    elif value == '':
        return None
    else:
        return 'invalid', value, line_num
# Print every token as "(line, kind, text)" until get_token() signals end of
# file by returning None, then release the file handle.
for token in iter(get_token, None):
    print(f"({token[2]}, {token[0]}, {token[1]})")
input_file.close()
Output:
(1, id, prod)
(1, symbol, ;)
(1, whitespace,
)
(2, whitespace,
)
(3, symbol, })