python compiler-construction character newline

program reads character as newline

I have this simplified version of a compiler code I've been making, and a strange occurrence has got me confused. Here is the simplified code:

LETTER = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
          'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
          'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M'
          'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W',
          'X', 'Y', 'Z']

WHITESPACE = [' ', '\n', '\r', '\v', '\t', '\f']

SYMBOL = [';', ':', ',', '[', ']', '{', '}', '(', ')', '+', '-', '*', '=', '<']

input_file = open('input.txt', 'r')
line_num = 1

def get_token():
    global line_num

    value = ''
    input_char = ''

    value = input_file.read(1)

    if value in WHITESPACE:
        start = line_num
        if value == '\n':
            line_num += 1
        return "whitespace", value, start
    
    elif value in LETTER:
        input_char = input_file.read(1)
        while input_char != '':
            if input_char in LETTER:
                value += input_char
            elif input_char in SYMBOL or input_char in WHITESPACE:
                input_file.seek(input_file.tell() - 1)
                return 'id', value, line_num
            else:
                value += input_char
                return 'invalid', value, line_num
            input_char = input_file.read(1)
        return 'id', value, line_num
    elif value in SYMBOL:
        return 'symbol', value, line_num
    elif value == '':
        return None
    else:
        return 'invalid', value, line_num
        


while True:
    token = get_token()
    if token:
        print(f"({token[2]}, {token[0]}, {token[1]})")
    else:
        break

input_file.close()

this is the content of the input file:

prod;

}

and this is the output I'm trying to achieve:

(1, id, prod)
(1, symbol, ;)
(1, whitespace, 
)
(2, whitespace, 
)
(3, symbol, })

but instead, this is the output I get:

(1, id, prod)
(1, whitespace, 
)
(2, whitespace, 
)
(3, whitespace, 
)
(4, symbol, })

After checking the value of the variables and file pointer at each stage, I came to the conclusion that the program really is recognizing the character ';' as newline, and it's also the case with any other SYMBOL I put in it's place. What's even more confusing is that if I write the fist like as 'prod ;' or 'prod;;' of 'prod; ' it gives the correct output. How is that happening?

Solution

You seem to work under not mentioned constrains (i.e. "the thing is I'm not allowed to use any imported libraries" from comment).

Your seeking does not work as you do it. Instead of using tell, keep track of the position yourself:

pos = 0

# handle your own positioning    
def get_char_and_pos(f):
    """Reads one character from file, returns character and position"""
    global pos
    pos += 1
    return f.read(1), pos-1

and replace all instances of input_file.read(1) by a call to your position tracking function:

# sets are faster for 'in' checks
LETTER = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
WHITESPACE = set(" \n\r\v\t\f")
SYMBOL = set(";:,[]{}()+-*=<")

# write demo file
with open("input.txt", "w") as f:
    f.write("prod;\n\n}") 

input_file = open('input.txt', 'r+')
line_num = 1

def get_token():
    global line_num

    value = ''
    input_char = ''

    value, value_pos = get_char_and_pos(input_file)    # use function 

    if value in WHITESPACE:
        start = line_num
        if value == '\n':
            line_num += 1
        return "whitespace", value, start
    
    elif value in LETTER:
        input_char, value_pos = get_char_and_pos(input_file)   # use function 
        while input_char != '':
            if input_char in LETTER:
                value += input_char
            elif input_char in SYMBOL or input_char in WHITESPACE:
                input_file.seek(value_pos)      # jump to correct position
                return 'id', value, line_num
            else:
                value += input_char
                return 'invalid', value, line_num
            input_char, value_pos = get_char_and_pos(input_file)  # use function 
        return 'id', value, line_num
    elif value in SYMBOL:
        return 'symbol', value, line_num
    elif value == '':
        return None
    else:
        return 'invalid', value, line_num
        


while True:
    token = get_token()
    if token:
        print(f"({token[2]}, {token[0]}, {token[1]})")
    else:
        break

input_file.close()

Output:

(1, id, prod)
(1, symbol, ;)  
(1, whitespace, 
)
(2, whitespace, 
)
(3, symbol, })