Search code examples
parsingloopspython-3.xruntime-errorscanning

Python3 - Creating a scanner for a compiler and getting errors upon testing


I am trying to create a scanner for a compiler which reads a simple language. I created a test file called program, which contains:

z := 2;
if z < 3 then
   z := 1
end

To run the program, I use terminal, and run the command line:

python3 scanner.py program tokens

I want the output to be put into the text file tokens, but nothing appears when I do this. During run time, the program runs but does not do anything. I tried to put <> around program but I got a ValueError: need more than 1 value to unpack.

My codes is as follows:

import re
import sys

class Scanner:
    '''The interface comprises the methods lookahead and consume.
      Other methods should not be called from outside of this class.'''

 def __init__(self, input_file):
  '''Reads the whole input_file to input_string, which remains constant.
     current_char_index counts how many characters of input_string have
     been consumed.
     current_token holds the most recently found token and the
     corresponding part of input_string.'''

    # source code of the program to be compiled
    self.input_string = input_file.read()

    # index where the unprocessed part of input_string starts
    self.current_char_index = 0

    # a pair (most recently read token, matched substring of input_string)
    self.current_token = self.get_token()

 def skip_white_space(self):
    '''Consumes all characters in input_string up to the next
      non-white-space character.'''
    if (self.current_char_index >= len(self.input_string) - 1):
        return

    while self.input_string[self.current_char_index].isspace():
        self.current_char_index += 1

 def get_token(self):
    '''Returns the next token and the part of input_string it matched.
      The returned token is None if there is no next token.
      The characters up to the end of the token are consumed.'''
    self.skip_white_space()
    # find the longest prefix of input_string that matches a token
    token, longest = None, ''
    for (t, r) in Token.token_regexp:
        match = re.match(r, self.input_string[self.current_char_index:])
        if match and match.end() > len(longest):
            token, longest = t, match.group()
    # consume the token by moving the index to the end of the matched part
    self.current_char_index += len(longest)
    return (token, longest)

 def lookahead(self):
    '''Returns the next token without consuming it.
      Returns None if there is no next token.'''
    return self.current_token[0]

 def consume(self, *tokens):
    '''Returns the next token and consumes it, if it is in tokens.
      Raises an exception otherwise.
      If the token is a number or an identifier, its value is returned
      instead of the token.'''
    current = self.current_token

    if (len(self.input_string[self.current_char_index:]) == 0):
        self.current_token = (None, '')         # catches the end-of-file errors so lookahead returns none.
    else:
        self.current_token = self.get_token()   # otherwise we consume the token

    if current[0] in tokens:         # tokens could be a single token, or it could be group of tokens.
        if current[0] is Token.ID or current[0] is Token.NUM:     # if token is ID or NUM
            return current[1]                   # return the value of the ID or NUM
        else:                                   # otherwise
            return current[0]                   # return the token
    else:                                       # if current_token is not in tokens
        raise Exception('non-token detected')   # raise non-token error

class Token:
 # The following enumerates all tokens.
 DO    = 'DO'
 ELSE  = 'ELSE'
 READ  = 'READ'
 WRITE = 'WRITE'
 END   = 'END'
 IF    = 'IF'
 THEN  = 'THEN'
 WHILE = 'WHILE'
 SEM   = 'SEM'
 BEC   = 'BEC'
 LESS  = 'LESS'
 EQ    = 'EQ'
 GRTR  = 'GRTR'
 LEQ   = 'LEQ'
 NEQ   = 'NEQ'
 GEQ   = 'GEQ'
 ADD   = 'ADD'
 SUB   = 'SUB'
 MUL   = 'MUL'
 DIV   = 'DIV'
 LPAR  = 'LPAR'
 RPAR  = 'RPAR'
 NUM   = 'NUM'
 ID    = 'ID'

 # The following list gives the regular expression to match a token.
 # The order in the list matters for mimicking Flex behaviour.
 # Longer matches are preferred over shorter ones.
 #  For same-length matches, the first in the list is preferred.
 token_regexp = [
  (DO,    'do'),
  (ELSE,  'else'),
  (READ,    'read'),
  (WRITE,  'write'),
  (END,   'end'),
  (IF,    'if'),
  (THEN,  'then'),
  (WHILE, 'while'),
  (SEM,   ';'),
  (BEC,   ':='),
  (LESS,  '<'),
  (EQ,    '='),
  (NEQ,    '!='),
  (GRTR,  '>'),
  (LEQ,   '<='),
  (GEQ,   '>='),
  (ADD,   '[+]'), # + is special in regular expressions
  (SUB,   '-'),
  (MUL,   '[*]'),
  (DIV,   '[/]'),
  (LPAR,  '[(]'), # ( is special in regular expressions
  (RPAR,  '[)]'), # ) is special in regular expressions
  (ID,    '[a-z]+'),
  (NUM,   '[0-9]+'),
]

 def indent(s, level):
   return '    '*level + s + '\n'

# Initialise scanner.

scanner = Scanner(sys.stdin)

# Show all tokens in the input.

token = scanner.lookahead()
test = ''

while token != None:
 if token in [Token.NUM, Token.ID]:
   token, value = scanner.consume(token)
   print(token, value)
 else:
   print(scanner.consume(token))
 token = scanner.lookahead()

Sorry if this is poorly explained. Any help on what is going wrong would be wonderful. Thanks.


Solution

  • Solution 1a

    I figured out why it was not printing to the file tokens. I needed to change my test code to this

    while token != None:
     print(scanner.consume(token))
     token = scanner.lookahead()
    

    the only problem now is I cannot read when it is an ID or a NUM, it only prints out the identifies or the number without stating which it is. Right now, it prints out this:

    z
    BEC
    2
    SEM
    IF
    z
    LESS
    3
    THEN
    z
    BEC
    1
    END

    And I need it to print out this

    NUM z
    BEC
    ID 2
    SEM
    IF
    ID z
    LESS
    NUM 3
    THEN
    ID z
    BEC
    NUM 1
    END

    I am thinking of adding an if statement which states that if it's a NUM, then print NUM followed by the token, and likewise for if it's an ID.

    Solution 1b

    I simply added an if and elif statement to consume to print NUM and ID. For example, If current[0] is Token.ID then return "ID " + current[1].