Search code examples
stringocamlsubstr

Exception: Invalid_argument "String.sub / Bytes.sub"


I wrote a tail recursive scanner for basic arithmetic expressions in OCaml

Syntax

Exp ::= n | Exp Op Exp | (Exp)

Op ::= + | - | * | /
type token =
| Tkn_NUM of int
| Tkn_OP of string
| Tkn_LPAR
| Tkn_RPAR
| Tkn_END

exception ParseError of string * string

let tail_tokenize s =
  let rec tokenize_rec s pos lt =
    if pos < 0 then lt
    else
      let c = String.sub s pos 1 in
      match c with
      | " " -> tokenize_rec s (pos-1) lt
      | "(" -> tokenize_rec s (pos-1) (Tkn_LPAR::lt)
      | ")" -> tokenize_rec s (pos-1) (Tkn_RPAR::lt)
      | "+" | "-" | "*" | "/" -> tokenize_rec s (pos-1) ((Tkn_OP c)::lt)
      | "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" ->
        (match lt with
         | (Tkn_NUM n)::lt' -> 
           (let lta = Tkn_NUM(int_of_string (c^(string_of_int n)))::lt' in
            tokenize_rec s (pos-1) lta)
         | _ -> tokenize_rec s (pos-1) (Tkn_NUM (int_of_string c)::lt) 
         )
      |_ -> raise (ParseError ("Tokenizer","unknown symbol: "^c))
  in
  tokenize_rec s (String.length s) [Tkn_END]

During execution I get

tail_tokenize "3+4";;
Exception: Invalid_argument "String.sub / Bytes.sub".

Solution

  • Your example case is this:

    tail_tokenize "3+4"
    

    The first call will look like this:

    tokenize_rec "3+4" 3 Tkn_END
    

    Since 3 is not less than 0, the first call inside tokenize_rec will look like this:

    String.sub "3+4" 3 1
    

    If you try this yourself you'll see that it's invalid:

    # String.sub "3+4" 3 1;;
    Exception: Invalid_argument "String.sub / Bytes.sub".
    

    It seems a little strange to work through the string backwards, but to do this you need to start at String.length s - 1.