I'm making a compiler for a C-like language that has to support #include directive (only in the beginning of the file)
A simple but inelegant approach would be to create a subprogram that finds every occurrence of the directive, and substitutes with the corresponding file in a new temporary file.
Now that is not nice at all. So i tried the following:
lexer = parse
| "#include \"" ( [^'"' '\n']* as filename) '"'
{ lexer (Lexing.from_channel (open_in filename)) ; lexer lexbuf }
the idea was the following: whenever you find an include, open a new channel using the given filename, and call recursively the "lexer" rule on that channel. After that, continue with the current state of your lexing-buffer and continue the lexing.
The problem is, it never worked.
i have also seen that it's possible to make a refiller when the buffer lexbuf has reached eof. But i failed to find more info. That gave me an idea of changing the above code to the following:
lexer = parse
| "#include \"" ( [^'"' '\n']* as filename) '"'
{ addCurrentLexBufToAStack lexbuf ;lexer (Lexing.from_channel (open_in filename)); }
and in the refiller, you would continue from the head of the stack
but seems very ambitious to work.
Any ideas?
P.s. The lexer (and parsing too) is called from another module (let's call it Main.ml)
Well, aren't you a bit confused about lexing and parsing ?
What I see is that :
If my lexeme is #include ident I want to parse what's in the file pointed by ident and add it.
You are then confusing parsing and lexing
You could write something like this : (it's a small program but it works ;-))
type operation =
| Plus of operation * operation
| Minus of operation * operation
| Int of int
type prog = string list * operation list
{
open Parser
open Lexing
open Ast
let current_pos b =
lexeme_start_p b,
lexeme_end_p b
}
let newline = '\n'
let space = [' ' '\t' '\r']
let digit = ['0' - '9']
let integer = digit+
rule token = parse
| newline { token lexbuf}
| space+ { token lexbuf}
| "#include \"" ( [^'"' '\n']* as filename) '"' { INCLUDE filename }
| integer as i { INTEGER (int_of_string i) }
| "+" { PLUSI }
| "-" { MINUSI }
| ";" { SC }
| "main" { MAIN }
| eof
{ EOF }
%{
open Ast
%}
%token <string> INCLUDE
%token EOF SC
%token PLUSI
%token MINUSI
%token MAIN
%token <int> INTEGER
%left PLUSI MINUSI
%start <Ast.prog> prog
%%
prog:
include_list MAIN operations EOF { ($1, $3) }
include_list:
| { [] }
| INCLUDE include_list { $1 :: $2 }
operation:
| operation PLUSI operation { Plus ($1, $3) }
| operation MINUSI operation { Minus ($1, $3) }
| INTEGER { Int $1 }
operations:
| operation { [$1] }
| operation SC operations { $1 :: $3 }
So, as you can see, when I parse I remember the filenames I have to parse and
open Lexing
open Ast
let rec print_op fmt op =
match op with
| Plus (op1, op2) ->
Format.fprintf fmt "(%a + %a)"
print_op op1 print_op op2
| Minus (op1, op2) ->
Format.fprintf fmt "(%a - %a)"
print_op op1 print_op op2
| Int i -> Format.fprintf fmt "%d" i
let rec read_includes fl =
List.fold_left (fun acc f ->
let c = open_in f in
let lb = Lexing.from_channel c in
let fl, p = Parser.prog Lexer.token lb in
close_in c;
let acc' = read_includes fl in
acc' @ p
) [] fl
let () =
try
let p = read_includes [Sys.argv.(1)] in
List.iter (Format.eprintf "%a@." print_op) p
with _ -> Format.eprintf "Bad Boy !@."
Which means that when I finished parsing the first file I parse the files included.
The most important thing is your confusion about lexing (which is the dumbest thing in a compiler, you just ask "What is the next token that you see ?" and he answers "I see #include "filename"
" and the parser which is not that dumb and says "Hey, the lexer saw #include "filename"
so I will keep in mind this filename because I may need it and I will keep going.
And if I have these three files :
#include "file2"
main
6; 7
#include "file3"
main
4; 5
main
1; 2; 3
If I call ./compile file1
I have the output 1 2 3 4 5 6
which is what I want. ;-)
New version with the lexer handling the includes :
type operation =
| Plus of operation * operation
| Minus of operation * operation
| Int of int
type prog = operation list
{
open Parser
let fset = Hashtbl.create 17
(* set keeping all the filenames *)
}
let newline = '\n'
let space = [' ' '\t' '\r']
let digit = ['0' - '9']
let integer = digit+
rule token = parse
| newline { token lexbuf}
| space+ { token lexbuf}
| "#include \"" ( [^'"' '\n']* as filename) '"'
{ if Hashtbl.mem fset filename then
raise Exit
else
let c = open_in filename in
Hashtbl.add fset filename ();
let lb = Lexing.from_channel c in
let p = Parser.prog token lb in
INCLUDE p
}
| integer as i { INTEGER (int_of_string i) }
| "+" { PLUSI }
| "-" { MINUSI }
| ";" { SC }
| "main" { MAIN }
| eof
{ EOF }
%{
open Ast
%}
%token <Ast.prog> INCLUDE
%token EOF SC
%token PLUSI
%token MINUSI
%token MAIN
%token <int> INTEGER
%left PLUSI MINUSI
%start <Ast.prog> prog
%%
prog:
include_list MAIN operations EOF { List.rev_append (List.rev $1) $3 }
include_list:
| { [] }
| INCLUDE include_list { List.rev_append (List.rev $1) $2 }
operation:
| operation PLUSI operation { Plus ($1, $3) }
| operation MINUSI operation { Minus ($1, $3) }
| INTEGER { Int $1 }
operations:
| operation { [$1] }
| operation SC operations { $1 :: $3 }
open Lexing
open Ast
let rec print_op fmt op =
match op with
| Plus (op1, op2) ->
Format.fprintf fmt "(%a + %a)"
print_op op1 print_op op2
| Minus (op1, op2) ->
Format.fprintf fmt "(%a - %a)"
print_op op1 print_op op2
| Int i -> Format.fprintf fmt "%d" i
let () =
try
let c = open_in Sys.argv.(1) in
let lb = Lexing.from_channel c in
let p = Parser.prog Lexer.token lb in
close_in c;
List.iter (Format.eprintf "%a@." print_op) p
with _ -> Format.eprintf "Bad Boy !@."
So, in the lexer, when I see a #include filename
I call immediately the Parser on the file linked by filename
and returns the Ast.prog
parsed to the previous call of parsing.
I hope it's all clear for you ;-)
I can't let this code like this, I edited it to avoid include loops (in lexer.mll) ;-)