Search code examples
parsingracketlexerlexical-analysis

(Building a lexer in Racket) How to identify & tokenize a line number that comes after a "gosub" statement


I am new to Racket and am building a lexer using the parser-tools/lex module. I want to tokenize a number that comes after a 'gosub' statement as a line-number token, but I am having trouble figuring out how to identify it as a line number rather than a regular number. I am reading in .txt files that look like this:

10 read A 20 read B 30 gosub 400 40 if C = 400 then write C 400 C = A + B : return $$

the "400" in "gosub 400" gets read as a number token.

I want this expected output for my gosub token and line-num token: ... token-GOSUB: GOSUB token-line-num: 400 ...

`#lang racket
;;; IMPORT
;; Import the lexer tools 
(require parser-tools/lex
         (prefix-in : parser-tools/lex-sre)  ; names from lex-sre are prefixed with :
         ;                                     to avoid name collisions
         )

;;; REGULAR EXPRESSIONS

;; Named regular expressions (lexer abbreviations) used by the lexer rules.
;; Operators taken from parser-tools/lex-sre carry a ":" prefix because of the
;; (prefix-in : ...) in the require form, e.g. :or, :/ and :+.
(define-lex-abbrevs
  [read        "read"]
  [write       "write"]
  [goto        "goto"]
  [gosub       "gosub"]
  [line-num    (:/ #\1 #\9)]                    ; one digit in 1-9 (zero is excluded)
  [letter      (:or (:/ "a" "z") (:/ #\A #\Z) "?" "!")]
  [digit       (:/ #\0 #\9)]
  [mult-op     (:or "*" "/")]                   ; must be the prefixed :or; plain `or` is not a lexer operator
  [add-op      (:or "+" "-")]
  [end-of-file "$$"]
  [paren-start "("]
  [paren-end   ")"])

;;; TOKENS

;; Value-carrying tokens: each token-NAME constructor declared here is called
;; with the value to attach (a number, a symbol, or the matched lexeme).
(define-tokens value-tokens
  (NUMBER
   END-OF-PROGRAM
   READ WRITE GOTO GOSUB
   LINE-NUM
   IDENTIFIER
   MULT-OP ADD-OP
   PAREN-START PAREN-END))

;; Value-less tokens: their constructors take no argument.
(define-empty-tokens op-tokens
  (newline := = < > ^ \( \) EOF))

;;; LEXER

;; lex : input-port -> position-token
;;
;; lexer-src-pos builds a function that scans an input port and returns one
;; position-token per call: the actual token together with the source
;; positions where it starts and ends.
;;
;; Rule selection: the longest match wins; when several rules match the same
;; number of characters, the rule listed first wins.  The keyword rules are
;; therefore placed before the generic IDENTIFIER rule.
(define lex
  (lexer-src-pos
    [(eof)                                          ; input:  end of input
     'EOF]                                          ; output: the symbol EOF

    [(:+ end-of-file)                               ; input:  one or more "$$"
     (token-END-OF-PROGRAM (string->symbol lexeme))]

    [(:or #\tab #\space #\newline)                  ; input:  whitespace
     (return-without-pos (lex input-port))]         ; output: the next token
    ;                                                 (i.e. skip the whitespace)

    ["\r"                                           ; input:  carriage return
     (token-newline)]                               ; output: a newline token
    ;                                                 note: (token-newline) returns 'newline

    [(:or ":" ":=" "^" "<" ">" "=")                 ; input:  an operator
     (string->symbol lexeme)]                       ; output: the corresponding symbol

    [(:or "+" "-")                                  ; input:  "+" or "-"
     (token-ADD-OP (string->symbol lexeme))]        ; output: an ADD-OP token

    [(:or "*" "/")                                  ; input:  "*" or "/"
     (token-MULT-OP (string->symbol lexeme))]       ; output: a MULT-OP token

    [(:+ digit)                                     ; input:  digits
     (token-NUMBER (string->number lexeme))]        ; output: a NUMBER token holding the number

    ;; Keywords are matched exactly once, so "readread" is an identifier
    ;; rather than a READ token whose lexeme is "readread".
    [read                                           ; input:  the string "read"
     (token-READ lexeme)]                           ; output: READ token

    [write                                          ; input:  the string "write"
     (token-WRITE lexeme)]                          ; output: WRITE token

    [goto                                           ; input:  the string "goto"
     (token-GOTO lexeme)]                           ; output: GOTO token

    [gosub                                          ; input:  the string "gosub" (the keyword only)
     (token-GOSUB lexeme)]                          ; output: GOSUB token

    ;; NOTE: this rule is never selected.  The NUMBER rule above matches every
    ;; digit string this rule can match (and more, since line-num excludes 0),
    ;; and it is listed first, so "400" after "gosub" still becomes NUMBER.
    ;; A lexer rule cannot see the previous token, so telling the two apart
    ;; needs either a combined "gosub <digits>" pattern or a parser.
    [(:+ line-num)                                  ; input:  digits 1-9 only
     (token-LINE-NUM lexeme)]                       ; output: LINE-NUM token holding the lexeme

    [(:+ letter)                                    ; input:  alphabetic letters
     (token-IDENTIFIER lexeme)]                     ; output: IDENTIFIER token whose value is the word

    ;; One token per parenthesis, so "((" yields two PAREN-START tokens.
    [paren-start                                    ; input:  (
     (token-PAREN-START lexeme)]                    ; output: PAREN-START token

    [paren-end                                      ; input:  )
     (token-PAREN-END lexeme)]                      ; output: PAREN-END token
  ))



;; string->tokens : path-string -> (listof position-token)
;; Despite its name, `s` is the path of the file to tokenize, not the program
;; text itself.  call-with-input-file closes the port once port->tokens
;; returns, so the file handle is released as soon as lexing finishes.
(define (string->tokens s)
  (call-with-input-file s port->tokens))

;; port->tokens : input-port -> (listof position-token)
;; Reads tokens from `in` with lex until the EOF token shows up and returns
;; them in reading order; the EOF token itself is not part of the result.
(define (port->tokens in)
  (let loop ([collected '()])
    (define next (lex in))
    (cond
      [(eq? 'EOF (position-token-token next)) (reverse collected)]
      [else (loop (cons next collected))])))

(provide string->tokens)`

I have tried using a regexp, but I am not really sure how to use it properly when working with tokens from parser-tools/lex in Racket. It had no effect: the value was simply returned as a number token again.


Solution

  • I added a new abbreviation:

    ;; "gosub", one or more spaces/tabs, then the (possibly empty) target line number
    [gosub+ (:seq "gosub" (:+ (:or #\space #\tab)) (:* digit))]

    If this string is found, it is parsed with its own function:

    ;; gosub-tokens : input-port -> (listof position-token)
    ;; Drains `in` with gosub-lex and returns the tokens in reading order,
    ;; leaving out the final EOF token.
    (define (gosub-tokens in)
      (let loop ([collected '()])
        (define next (gosub-lex in))
        (if (eq? 'EOF (position-token-token next))
            (reverse collected)
            (loop (cons next collected)))))
    

    and its own lex:

    ;; gosub-lex : input-port -> position-token
    ;; Small lexer meant for the text matched by gosub+: it only knows the
    ;; keyword, whitespace and digits.
    (define gosub-lex
      (lexer-src-pos
       ;; end of input
       [(eof) 'EOF]
       ;; whitespace is skipped by handing back the next token
       [(:or #\tab #\space #\newline)
        (return-without-pos (gosub-lex input-port))]
       ;; digits become a LINE-NUM token holding the number
       [(:+ digit)
        (token-LINE-NUM (string->number lexeme))]
       ;; the keyword becomes a GOSUB token holding the lexeme
       [(:+ gosub)
        (token-GOSUB lexeme)]))
    

    The full code:

    #lang racket
    (require parser-tools/lex
             (prefix-in : parser-tools/lex-sre))
    
    ;;; REGULAR EXPRESSIONS
    
    ;; Named regular expressions (lexer abbreviations) used by the lexers.
    ;; Operators taken from parser-tools/lex-sre carry a ":" prefix because of
    ;; the (prefix-in : ...) in the require form, e.g. :or, :/ and :+.
    (define-lex-abbrevs
      [read        "read"]
      [write       "write"]
      [goto        "goto"]
      [digit       (:/ #\0 #\9)]
      [gosub       "gosub"]
      ;; "gosub", one or more spaces/tabs, then the (possibly empty) target
      ;; line number.  Accepting a run of blanks means "gosub  400" and
      ;; "gosub<TAB>400" are recognised as well as "gosub 400".
      [gosub+      (:seq "gosub" (:+ (:or #\space #\tab)) (:* digit))]
      [letter      (:or (:/ "a" "z") (:/ #\A #\Z) "?" "!")]
      [mult-op     (:or "*" "/")]               ; must be the prefixed :or; plain `or` is not a lexer operator
      [add-op      (:or "+" "-")]
      [end-of-file "$$"]
      [paren-start "("]
      [paren-end   ")"])
    
    ;;; TOKENS
    
    ;; Value-carrying tokens: each token-NAME constructor declared here is
    ;; called with the value to attach (a number, a symbol, or a lexeme).
    (define-tokens value-tokens
      (NUMBER
       END-OF-PROGRAM
       READ WRITE GOTO
       GOSUB GOSUB+ LINE-NUM
       IDENTIFIER
       MULT-OP ADD-OP
       PAREN-START PAREN-END))

    ;; Value-less tokens: their constructors take no argument.
    (define-empty-tokens op-tokens
      (newline := = < > ^ \( \) EOF))
    
    ;;; LEXER
    
    ;; gosub-lex : input-port -> position-token
    ;; Small lexer meant for the text of a GOSUB+ token: it only knows the
    ;; keyword, whitespace and digits.
    (define gosub-lex
      (lexer-src-pos
       ;; end of input
       [(eof) 'EOF]
       ;; whitespace is skipped by handing back the next token
       [(:or #\tab #\space #\newline)
        (return-without-pos (gosub-lex input-port))]
       ;; digits become a LINE-NUM token holding the number
       [(:+ digit)
        (token-LINE-NUM (string->number lexeme))]
       ;; the keyword becomes a GOSUB token holding the lexeme
       [(:+ gosub)
        (token-GOSUB lexeme)]))
    
    ;; lex : input-port -> position-token
    ;; Main lexer.  The longest match wins; when several rules match the same
    ;; number of characters, the rule listed first wins, which is why the
    ;; keyword rules come before the IDENTIFIER rule.
    (define lex
      (lexer-src-pos
        [(eof)                                          ; end of input
         'EOF]
        [(:+ end-of-file)                               ; one or more "$$"
         (token-END-OF-PROGRAM (string->symbol lexeme))]
        [(:or #\tab #\space #\newline)                  ; skip whitespace
         (return-without-pos (lex input-port))]
        ["\r"                                           ; carriage return
         (token-newline)]
        [(:or ":" ":=" "^" "<" ">" "=")                 ; operators become symbols
         (string->symbol lexeme)]
        [(:or "+" "-")
         (token-ADD-OP (string->symbol lexeme))]
        [(:or "*" "/")
         (token-MULT-OP (string->symbol lexeme))]
        [(:+ digit)
         (token-NUMBER (string->number lexeme))]
        ;; Keywords are matched exactly once, so "readread" is an identifier
        ;; rather than a READ token whose lexeme is "readread".
        [read
         (token-READ lexeme)]
        [write
         (token-WRITE lexeme)]
        [goto
         (token-GOTO lexeme)]
        ;; "gosub" together with its target; the GOSUB+ value is the whole
        ;; matched text.
        [gosub+
         (token-GOSUB+ lexeme)]
        ;; A bare "gosub" (nothing for gosub+ to match after it) is still the
        ;; keyword, not an identifier.
        [gosub
         (token-GOSUB lexeme)]
        [(:+ letter)
         (token-IDENTIFIER lexeme)]
        ;; One token per parenthesis, so "((" yields two PAREN-START tokens.
        [paren-start
         (token-PAREN-START lexeme)]
        [paren-end
         (token-PAREN-END lexeme)]))
    
    ;; string->tokens : path-string -> (listof position-token)
    ;; Despite its name, `s` is the path of the file to tokenize, not the
    ;; program text itself.  call-with-input-file closes the port once
    ;; port->tokens returns, so the file handle is released as soon as lexing
    ;; finishes.
    (define (string->tokens s)
      (call-with-input-file s port->tokens))
    
    ;; gosub-tokens : input-port -> (listof position-token)
    ;; Drains `in` with gosub-lex and returns the tokens in reading order,
    ;; leaving out the final EOF token.
    (define (gosub-tokens in)
      (let loop ([collected '()])
        (define next (gosub-lex in))
        (if (eq? 'EOF (position-token-token next))
            (reverse collected)
            (loop (cons next collected)))))
            
    ;; port->tokens : input-port -> (listof position-token)
    ;; Reads tokens from `in` with lex until EOF.  A GOSUB+ token is not kept:
    ;; its text is re-lexed with gosub-tokens and the resulting tokens are
    ;; spliced into the output in its place.
    ;; NOTE(review): the spliced tokens come from a fresh string port, so their
    ;; positions are presumably relative to the GOSUB+ text rather than to the
    ;; original input -- confirm before relying on them.
    (define (port->tokens in)
      (define pos-tok (lex in))
      (define tok (position-token-token pos-tok))
      (cond
        [(eq? tok 'EOF) '()]
        [(eq? (token-name tok) 'GOSUB+)
         (define expanded (gosub-tokens (open-input-string (token-value tok))))
         (append expanded (port->tokens in))]
        [else (cons pos-tok (port->tokens in))]))
    
    (provide string->tokens)