
token meaning dependent on context


I have a weird string syntax where the meaning of a delimiter depends on context. In the following sample input:

( (foo) (bar) )

the desired result is a list of two strings, ["foo"; "bar"]. The outer pair of parentheses enters list mode; each following pair of parentheses delimits a string. Inside a string, balanced pairs of parentheses are treated as part of the string.
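
To make the intended semantics concrete, here is a small check expressed against the modules shown further down (the local parse helper is just for illustration and is not part of those files):

let () =
  let parse s =
    Sample_parser.process Sample_lexer.tokenize (Lexing.from_string s) in
  assert (parse "( (foo) (bar) )" = ["foo"; "bar"]);
  (* balanced parentheses inside a string stay part of it *)
  assert (parse "( (foo (bar) baz) (xyzzy) )" = ["foo (bar) baz"; "xyzzy"])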

Right now the lexer decides what to return depending on a global variable inside.

{
  open Sample_parser
  exception Error of string
  let inside = ref false (* <= to be eliminated *)
}

The delimiters are parentheses. If the lexer hits an opening parenthesis, then

  • If inside is false, it emits an Enter token and sets inside to true.
  • If inside is true, it switches to a string lexer that treats any properly nested pair of parentheses as part of the string. When a closing parenthesis is seen at nesting level zero, the buffer's contents are returned to the parser as a String token.

If a closing parenthesis is encountered outside a string, a Leave token is emitted and inside is unset.
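
For the sample input ( (foo) (bar) ), the lexer described above should therefore emit the token stream Enter, String "foo", String "bar", Leave. A minimal check of that, assuming the files below compile as shown, is to pull the four tokens by hand:

let () =
  let buf = Lexing.from_string "( (foo) (bar) )" in
  for _i = 1 to 4 do
    match Sample_lexer.tokenize buf with
    | Sample_parser.Enter    -> print_endline "Enter"
    | Sample_parser.Leave    -> print_endline "Leave"
    | Sample_parser.String s -> Printf.printf "String %S\n" s
  done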

My question is: How do I rewrite the lexer without the global variable inside?

Fwiw I use menhir, but afaict the same would be true for ocamlyacc. (Sorry if this sounds confused; I'm really a newbie to the yacc/lex approach. I can express all of the above without thinking as a PEG, but I haven't got used to mentally keeping the lexer and the parser separate. Feel free to point out other issues with the code!)

Simple example: *sample_lexer.mll*

{
  open Sample_parser
  exception Error of string
  let inside = ref false (* <= to be eliminated *)
}

let lpar  = "("
let rpar  = ")"
let ws    = [' ' '\t' '\n' '\r']

rule tokenize = parse
  | ws    { tokenize lexbuf }
  | lpar  { if not !inside then begin
              inside := true;
              Enter
            end else begin
              let buf = Buffer.create 20 in
              String (string_scanner
                        (Lexing.lexeme_start lexbuf)
                        0
                        buf
                        lexbuf)
            end }
  | rpar  { inside := false; Leave }
and string_scanner init depth buf = parse
  | rpar  { if depth = 0 then
              Buffer.contents buf
            else begin
              Buffer.add_char buf ')';
              string_scanner init (depth - 1) buf lexbuf
            end }
  | lpar  { Buffer.add_char buf '(';
            string_scanner init (depth + 1) buf lexbuf }
  | eof   { raise (Error (Printf.sprintf
                           "Unexpected end of file inside string, pos %d--%d!\n"
                           init
                           (Lexing.lexeme_start lexbuf))) }
  | _ as chr { Buffer.add_char buf chr;
               string_scanner init depth buf lexbuf }

*sample_parser.mly*:

%token <string> String
%token Enter
%token Leave

%start <string list> process

%%

process:
  | Enter lst = string_list Leave { lst }

string_list:
  | elm = element lst = string_list { elm :: lst }
  | elm = element                   { [elm]      }

element:
  | str = String { str }

*main.ml*:

open Batteries

let sample_input = "( (foo (bar) baz) (xyzzy) )"
(*                  EibssssssssssssseibssssseiL
 * where E := Enter (enter list mode)
 *       L := Leave (leave list mode)
 *       i := ignore (whitespace)
 *       b := begin string
 *       e := end string
 *       s := part of string
 *
 * desired result: [ "foo (bar) baz"; "xyzzy" ] (type string list)
 *)

let main () =
  let buf = Lexing.from_string sample_input in
  try
    List.print
      String.print stdout
      (Sample_parser.process Sample_lexer.tokenize buf);
    print_string "\n"
  with
  | Sample_lexer.Error msg   -> Printf.eprintf "%s%!" msg
  | Sample_parser.Error      -> Printf.eprintf
                                    "Invalid syntax at pos %d.\n%!"
                                    (Lexing.lexeme_start buf)

let _ = main ()

Solution

  • You can pass the state as an argument to tokenize. It still has to be a mutable reference, but it no longer needs to be global.

    rule tokenize inside = parse
      | ws    { tokenize inside lexbuf }
      | lpar  { if not !inside then begin
                  inside := true;
                  Enter
                end else begin
                  let buf = Buffer.create 20 in
                  String (string_scanner
                            (Lexing.lexeme_start lexbuf)
                            0
                            buf
                            lexbuf)
                end }
      | rpar  { inside := false; Leave }
    

    And you call the parser as follows:

    Sample_parser.process (Sample_lexer.tokenize (ref false)) buf
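
    Since the ref is created at the call site, every parse gets its own state and the lexer becomes re-entrant. If you want to hide the mutable state completely, wrap the call in a small helper (a sketch; the name parse_string is just my choice):

    let parse_string s =
      let buf = Lexing.from_string s in
      Sample_parser.process (Sample_lexer.tokenize (ref false)) buf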