Created
September 9, 2015 16:45
-
-
Save lindig/e86111675025819cee43 to your computer and use it in GitHub Desktop.
Some Scanning Recipes for OCamlLex
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
(* short names for important modules *) | |
module L = Lexing | |
module B = Buffer | |
type token = | |
| STR of string | |
| INT of int | |
| ID of string | |
| PLUSEQ | |
| MINUSEQ | |
| STAREQ | |
| SLASHEQ | |
| PLUS | |
| MINUS | |
| STAR | |
| SLASH | |
| ASSIGN | |
| EOF (* end of input *) | |
let get = L.lexeme | |
let sprintf = Printf.sprintf | |
let position lexbuf = | |
let p = lexbuf.L.lex_curr_p in | |
sprintf "%s:%d:%d" | |
p.L.pos_fname p.L.pos_lnum (p.L.pos_cnum - p.L.pos_bol) | |
let set_filename (fname:string) (lexbuf:L.lexbuf) = | |
( lexbuf.L.lex_curr_p <- | |
{ lexbuf.L.lex_curr_p with L.pos_fname = fname } | |
; lexbuf | |
) | |
exception Error of string | |
let error lexbuf fmt = | |
Printf.kprintf (fun msg -> | |
raise (Error ((position lexbuf)^" "^msg))) fmt | |
} | |
let ws = [' ' '\t'] | |
let nl = ['\n'] | |
let digit = ['0'-'9'] | |
let alpha = ['a'-'z' 'A'-'Z'] | |
let id = alpha (alpha|digit)* | |
rule token = parse | |
| ws+ { token lexbuf } | |
| nl { L.new_line lexbuf; token lexbuf } | |
| digit+ { INT(int_of_string @@ get lexbuf) } | |
| id { ID(get lexbuf)} | |
| '+' { PLUS } | |
| '-' { MINUS } | |
| '*' { STAR } | |
| '/' { SLASH } | |
| "+=" { PLUSEQ } | |
| "-=" { MINUSEQ } | |
| "*=" { STAREQ } | |
| "/=" { SLASHEQ } | |
| ":=" { ASSIGN } | |
| '"' { STR (string (B.create 100) lexbuf) } (* see below *) | |
| eof { EOF } | |
| _ { error lexbuf | |
"found '%s' - don't know how to handle" @@ get lexbuf } | |
and escape b = parse | |
| '&' { Buffer.add_string b "&"; escape b lexbuf } | |
| '"' { Buffer.add_string b """; escape b lexbuf } | |
| '\'' { Buffer.add_string b "'"; escape b lexbuf } | |
| '>' { Buffer.add_string b ">"; escape b lexbuf } | |
| '<' { Buffer.add_string b "<"; escape b lexbuf } | |
| [^'&' '"' '\'' '>' '<']+ | |
{ Buffer.add_string b @@ get lexbuf | |
; escape b lexbuf | |
} | |
| eof { let x = Buffer.contents b in Buffer.clear b; x } | |
| _ { error lexbuf | |
"don't know how to quote: %s" (get lexbuf) } | |
and string buf = parse (* use buf to build up result *) | |
| [^'"' '\n' '\\']+ | |
{ B.add_string buf @@ get lexbuf | |
; string buf lexbuf | |
} | |
| '\n' { B.add_string buf @@ get lexbuf | |
; L.new_line lexbuf | |
; string buf lexbuf | |
} | |
| '\\' '"' { B.add_char buf '"' | |
; string buf lexbuf | |
} | |
| '\\' { B.add_char buf '\\' | |
; string buf lexbuf | |
} | |
| '"' { B.contents buf } (* return *) | |
| eof { error lexbuf "end of input inside of a string" } | |
| _ { error lexbuf | |
"found '%s' - don't know how to handle" @@ get lexbuf } | |
{ | |
let escape str = escape (B.create 100) (L.from_string str) | |
let to_string = function | |
| STR(str) -> sprintf "STR(%s)" (escape str) | |
| INT(d) -> sprintf "INT(%d)" d | |
| PLUS -> sprintf "PLUS" | |
| MINUS -> sprintf "MINUS" | |
| STAR -> sprintf "STAR" | |
| SLASH -> sprintf "SLASH" | |
| PLUSEQ -> sprintf "PLUSEQ" | |
| MINUSEQ -> sprintf "MINUSEQ" | |
| STAREQ -> sprintf "STAREQ" | |
| SLASHEQ -> sprintf "SLASHEQ" | |
| ID(str) -> sprintf "ID(%s)" str | |
| ASSIGN -> sprintf "ASSIGN" | |
| EOF -> sprintf "EOF" | |
let main () = | |
let lexbuf = set_filename "stdin" @@ L.from_channel stdin in | |
let rec loop acc = function | |
| EOF -> to_string EOF :: acc |> List.rev | |
| x -> loop (to_string x :: acc) (token lexbuf) | |
in | |
loop [] (token lexbuf) | |
|> String.concat " " | |
|> print_endline | |
let () = main () (* call main function on startup *) | |
} |
Sorry, I have not used Menhir and can't provide an answer without porting it to Menhir myself. From the documentation the same trick used here could be available: I am defining tokens
in the early part of file which is just copied to the generated. Such a header section is available in Menhir as well. The same trick is used at the bottom to define the main
function.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Awesome thanks for your sharing, it's good to read this example.
Just wondering how to use menhir with this gist? From my understanding, we should define the tokens in parser.mly, and use
open Parser
in thetokenizer.mll
file.In this gist, it seems that we define the tokens in tokenizer.mll, I did try to define a simple parser.mly like
And there's an error like:
$ dune build File "bin/parser.mly", line 12, characters 4-7: Error: EOF is undefined.