Search code examples
python compiler-construction character newline

program reads character as newline


I have this simplified version of a compiler code I've been making, and a strange occurrence has got me confused. Here is the simplified code:

# Every ASCII letter that may appear in an identifier, lowercase first.
# Built from strings rather than a hand-written list of literals: adjacent
# string literals are concatenated by Python, so a single forgotten comma in
# such a list silently merges two letters into one two-character entry.
LETTER = list('abcdefghijklmnopqrstuvwxyz'
              'ABCDEFGHIJKLMNOPQRSTUVWXYZ')

# Characters reported as "whitespace" tokens.
WHITESPACE = list(' \n\r\v\t\f')

# Single-character tokens reported as "symbol".
SYMBOL = list(';:,[]{}()+-*=<')

# Source text to tokenize, opened in text mode; get_token() reads it one
# character at a time.
input_file = open('input.txt', 'r')
# Line number reported with each token; starts at 1 and is incremented by
# get_token() each time it returns a '\n'.
line_num = 1

def get_token():
    """Read and return the next token from the module-level ``input_file``.

    Returns a ``(kind, text, line)`` tuple, where ``kind`` is one of
    ``"whitespace"``, ``'id'``, ``'symbol'`` or ``'invalid'``, or ``None``
    once the file is exhausted.  Increments the global ``line_num`` whenever
    a newline character is returned.
    """
    global line_num

    value = ''
    input_char = ''

    # The first character decides which kind of token is being read.
    value = input_file.read(1)

    if value in WHITESPACE:
        # Report the line the whitespace was found on, then count the newline.
        start = line_num
        if value == '\n':
            line_num += 1
        return "whitespace", value, start
    
    elif value in LETTER:
        # Identifier: keep consuming letters until a delimiter or end of file.
        input_char = input_file.read(1)
        while input_char != '':
            if input_char in LETTER:
                value += input_char
            elif input_char in SYMBOL or input_char in WHITESPACE:
                # Un-read the delimiter so that the next call returns it.
                # NOTE(review): on a text-mode file tell() returns an opaque
                # number, so tell() - 1 is not guaranteed to address the
                # previous character -- verify against the io documentation.
                input_file.seek(input_file.tell() - 1)
                return 'id', value, line_num
            else:
                # Any other character is appended and the token is rejected.
                value += input_char
                return 'invalid', value, line_num
            input_char = input_file.read(1)
        # End of file reached while still inside the identifier.
        return 'id', value, line_num
    elif value in SYMBOL:
        return 'symbol', value, line_num
    elif value == '':
        # read(1) returned the empty string: nothing left to read.
        return None
    else:
        return 'invalid', value, line_num
        


# Print every token as "(line, kind, text)" until get_token() signals the end
# of the input, then release the file handle.
token = get_token()
while token:
    kind, text, line = token
    print(f"({line}, {kind}, {text})")
    token = get_token()

input_file.close()

this is the content of the input file:

prod;

}

and this is the output I'm trying to achieve:

(1, id, prod)
(1, symbol, ;)
(1, whitespace, 
)
(2, whitespace, 
)
(3, symbol, })

but instead, this is the output I get:

(1, id, prod)
(1, whitespace, 
)
(2, whitespace, 
)
(3, whitespace, 
)
(4, symbol, })

After checking the value of the variables and the file pointer at each stage, I came to the conclusion that the program really is recognizing the character ';' as a newline, and this is also the case with any other SYMBOL I put in its place. What's even more confusing is that if I write the first line as 'prod ;' or 'prod;;' or 'prod; ' it gives the correct output. How is that happening?


Solution

  • You seem to be working under unmentioned constraints (i.e. "the thing is I'm not allowed to use any imported libraries", from a comment).

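    Your seeking does not work the way you do it: on a file opened in text mode, tell() returns an opaque number rather than a character index, so tell() - 1 is not guaranteed to point at the previous character. Instead of using tell, keep track of the position yourself: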

    # Running character counter maintained by get_char_and_pos().
    pos = 0


    def get_char_and_pos(f):
        """Read a single character from *f*.

        Returns a ``(character, index)`` pair, where *index* is the value the
        module-level counter ``pos`` held before this call.  The counter is
        advanced on every call, including calls made at end of file (where
        the character is ``''``).
        """
        global pos
        index = pos
        pos = index + 1
        return f.read(1), index
    

    and replace all instances of input_file.read(1) with a call to your position-tracking function:

    # Sets rather than lists: membership tests with `in` are faster on a set.
    LETTER = {*"abcdefghijklmnopqrstuvwxyz", *"ABCDEFGHIJKLMNOPQRSTUVWXYZ"}
    WHITESPACE = {" ", "\n", "\r", "\v", "\t", "\f"}
    SYMBOL = {*";:,[]{}()+-*=<"}
    
    # write demo file
    # (recreates the three-line sample input so the snippet runs on its own;
    # any existing input.txt is overwritten)
    with open("input.txt", "w") as f:
        f.write("prod;\n\n}") 
    
    # Reopen the file for tokenizing.
    # NOTE(review): 'r+' also allows writing, but the code shown here only
    # reads and seeks -- plain 'r' may be enough; confirm.
    input_file = open('input.txt', 'r+')
    # Line number reported with each token; get_token() increments it each
    # time it returns a '\n'.
    line_num = 1
    
    def get_token():
        """Read and return the next token from the module-level ``input_file``.

        Characters are fetched through ``get_char_and_pos`` so that the index
        of each character is known without calling ``tell()``.

        Returns:
            A ``(kind, text, line)`` tuple, where ``kind`` is ``"whitespace"``,
            ``'id'``, ``'symbol'`` or ``'invalid'``; or ``None`` once the file
            is exhausted.

        Side effects:
            Increments the global ``line_num`` when a newline is returned, and
            rewinds both the file and the global ``pos`` counter by one
            character when an identifier is ended by a delimiter.
        """
        global line_num, pos

        # The first character decides which kind of token is being read.
        value, value_pos = get_char_and_pos(input_file)

        if value in WHITESPACE:
            # Report the line the whitespace was found on, then count the newline.
            start = line_num
            if value == '\n':
                line_num += 1
            return "whitespace", value, start

        if value in LETTER:
            # Identifier: keep consuming letters until a delimiter or end of file.
            input_char, char_pos = get_char_and_pos(input_file)
            while input_char != '':
                if input_char in LETTER:
                    value += input_char
                elif input_char in SYMBOL or input_char in WHITESPACE:
                    # Un-read the delimiter so that the next call returns it.
                    # The counter must be rewound together with the file;
                    # otherwise every later index is one too high and the next
                    # push-back skips the very delimiter it should restore.
                    # NOTE(review): seeking a text-mode file to a character
                    # count assumes one stored byte per character (ASCII text
                    # with '\n' line endings) -- confirm for real inputs.
                    input_file.seek(char_pos)
                    pos = char_pos
                    return 'id', value, line_num
                else:
                    # Any other character is appended and the token is rejected.
                    value += input_char
                    return 'invalid', value, line_num
                input_char, char_pos = get_char_and_pos(input_file)
            # End of file reached while still inside the identifier.
            return 'id', value, line_num

        if value in SYMBOL:
            return 'symbol', value, line_num
        if value == '':
            # read(1) returned the empty string: nothing left to read.
            return None
        return 'invalid', value, line_num
            
    
    
    # Emit each token as "(line, kind, text)"; get_token() returns None when
    # the input is exhausted, which ends the loop.
    while True:
        token = get_token()
        if not token:
            break
        kind, text, line = token
        print(f"({line}, {kind}, {text})")

    input_file.close()
    

    Output:

    (1, id, prod)
    (1, symbol, ;)  
    (1, whitespace, 
    )
    (2, whitespace, 
    )
    (3, symbol, })