Last active
September 6, 2023 13:43
-
-
Save neel-krishnaswami/b1594c57433b7df2a143634a2fff3544 to your computer and use it in GitHub Desktop.
A linear-time parser combinator library in Ocaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module C : sig | |
type t | |
val empty : t | |
val one : char -> t | |
val union : t -> t -> t | |
val inter : t -> t -> t | |
val top : t | |
val mem : char -> t -> bool | |
val make : (char -> bool) -> t | |
val equal : t -> t -> bool | |
val negate : t -> t | |
val is_empty : t -> bool | |
val disjoint : t -> t -> bool | |
val fold : (char -> 'a -> 'a) -> 'a -> t -> 'a | |
val of_string : string -> t | |
end = struct | |
type t = bytes | |
let make f = | |
let open Int in | |
let open Char in | |
Bytes.init 32 (fun i -> | |
let b0 = shift_left (Bool.to_int (f (chr (i * 8 + 0)))) 0 in | |
let b1 = shift_left (Bool.to_int (f (chr (i * 8 + 1)))) 1 in | |
let b2 = shift_left (Bool.to_int (f (chr (i * 8 + 2)))) 2 in | |
let b3 = shift_left (Bool.to_int (f (chr (i * 8 + 3)))) 3 in | |
let b4 = shift_left (Bool.to_int (f (chr (i * 8 + 4)))) 4 in | |
let b5 = shift_left (Bool.to_int (f (chr (i * 8 + 5)))) 5 in | |
let b6 = shift_left (Bool.to_int (f (chr (i * 8 + 6)))) 6 in | |
let b7 = shift_left (Bool.to_int (f (chr (i * 8 + 7)))) 7 in | |
let (||) = logor in | |
Char.chr (b7 || b6 || b5 || b4 || b3 || b2 || b1 || b0)) | |
let mem c s = | |
let b = (Char.code c) / 8 in | |
let i = (Char.code c) mod 8 in | |
let w = Char.code (Bytes.get s b) in | |
Int.logand (Int.shift_left 1 i) w > 0 | |
let empty = make (fun c -> false) | |
let top = make (fun c -> true) | |
let one c = make (fun c' -> c = c') | |
let union s1 s2 = make (fun c -> mem c s1 || mem c s2) | |
let inter s1 s2 = make (fun c -> mem c s1 && mem c s2) | |
let negate s = make (fun c -> not (mem c s)) | |
let equal s1 s2 = | |
let rec loop i acc = | |
if i = 32 then | |
acc | |
else | |
loop (i+1) (acc && (Bytes.get s1 i = Bytes.get s2 i)) | |
in | |
loop 0 true | |
let is_empty s = equal s empty | |
let disjoint s1 s2 = is_empty (inter s1 s2) | |
let fold f init s = | |
let rec loop i acc = | |
if i > 255 then | |
acc | |
else | |
let c = Char.chr i in | |
if mem c s then | |
loop (i+1) (f c acc) | |
else | |
loop (i+1) acc | |
in | |
loop 0 init | |
let of_string str = | |
let p c = String.contains str c in | |
make p | |
end | |
module Tp : sig | |
type t = { null : bool; first : C.t; follow : C.t } | |
exception TypeError of string | |
val char : char -> t | |
val eps : t | |
val seq : t -> t -> t | |
val charset : C.t -> t | |
val string : string -> t | |
val alt : t -> t -> t | |
val bot : t | |
val equal : t -> t -> bool | |
val fix : (t -> t) -> t | |
val print : Format.formatter -> t -> unit | |
end = struct | |
type t = { | |
null : bool; | |
first : C.t; | |
follow : C.t; | |
} | |
exception TypeError of string | |
let char c = { | |
null = false; | |
first = C.one c; | |
follow = C.empty; | |
} | |
let eps = { | |
null = true; | |
first = C.empty; | |
follow = C.empty; | |
} | |
let seq t1 t2 = | |
let separate t1 t2 = | |
not t1.null | |
&& | |
C.disjoint t1.follow t2.first | |
in | |
if separate t1 t2 then | |
{ null = false; | |
first = t1.first; | |
follow = C.union t2.follow (if t2.null then t1.follow else C.empty); | |
} | |
else | |
raise (TypeError "ambiguous sequencing") | |
let string s = | |
if String.length s = 0 then | |
eps | |
else | |
char s.[0] | |
let alt t1 t2 = | |
let nonoverlapping t1 t2 = | |
not (t1.null && t2.null) | |
&& | |
C.disjoint t1.first t2.first | |
in | |
if nonoverlapping t1 t2 then | |
{ | |
null = t1.null || t2.null; | |
first = C.union t1.first t2.first; | |
follow = C.union t1.follow t2.follow; | |
} | |
else | |
raise (TypeError "ambiguous alternation") | |
let bot = { | |
null = false; | |
first = C.empty; | |
follow = C.empty; | |
} | |
let charset cs = | |
if C.is_empty cs then | |
bot | |
else | |
{ null = false; | |
first = cs; | |
follow = C.empty; | |
} | |
let equal t1 t2 = | |
t1.null = t2.null | |
&& C.equal t1.first t2.first | |
&& C.equal t1.follow t2.follow | |
let fix f = | |
let rec loop t = | |
let t' = f t in | |
if equal t t' then | |
t' | |
else | |
loop t' | |
in | |
loop bot | |
let print out t = | |
let p fmt = Format.fprintf out fmt in | |
let print_set cs = | |
C.fold (fun c () -> p "%c" c) () cs | |
in | |
let print_bool = function | |
| true -> p "true" | |
| false -> p "false" | |
in | |
p "{\n"; | |
p " null = "; print_bool t.null; p ";\n"; | |
p " first = C.of_string \""; print_set t.first; p "\";\n"; | |
p " follow = C.of_string \""; print_set t.follow; p "\";\n"; | |
p "}\n" | |
end | |
module Parser: sig | |
type 'a t | |
exception ParseFailure of int | |
val char : char -> unit t | |
val charset : C.t -> char t | |
val string : string -> unit t | |
val map : ('a -> 'b) -> 'a t -> 'b t | |
val (let+) : 'a t -> ('a -> 'b) -> 'b t | |
val seq : 'a t -> 'b t -> ('a * 'b) t | |
val (and+) : 'a t -> 'b t -> ('a * 'b) t | |
val eps : unit t | |
val return : 'a -> 'a t | |
val fail : 'a t | |
val any : 'a t list -> 'a t | |
val fix : ('a t -> 'a t) -> 'a t | |
val parse : 'a t -> string -> int -> (int * 'a) | |
end = struct | |
type 'a t = { tp : Tp.t; parse : string -> int -> int * 'a } | |
exception ParseFailure of int | |
let char c = | |
let p s i = | |
if i < String.length s && s.[i] = c then | |
(i+1, ()) | |
else | |
raise (ParseFailure i) | |
in | |
{ tp = Tp.char c; parse = p } | |
let (let+) p f = | |
let p' s i = | |
let (i, v) = p.parse s i in | |
(i, f v) | |
in | |
{tp = p.tp; parse = p'} | |
let map f p = let+ x = p in f x | |
let (and+) p1 p2 = | |
let p' s i = | |
let (i, a) = p1.parse s i in | |
let (i, b) = p2.parse s i in | |
(i, (a,b)) | |
in | |
{ tp = Tp.seq p1.tp p2.tp; parse = p' } | |
let seq = (and+) | |
let eps = { tp = Tp.eps; parse = fun s i -> (i, ()) } | |
let return x = | |
let+ () = eps in x | |
let charset cs = | |
let p s i = | |
if i < String.length s && C.mem s.[i] cs then | |
(i+1, s.[i]) | |
else | |
raise (ParseFailure i) | |
in | |
{tp = Tp.charset cs; parse = p } | |
let string str = | |
let p s i = | |
if i + String.length str < String.length s then | |
let rec loop j = | |
if j < String.length str then | |
if str.[j] = s.[i + j] then | |
loop (j+1) | |
else | |
raise (ParseFailure (i+j)) | |
else | |
(i+j, ()) | |
in | |
loop 0 | |
else | |
raise (ParseFailure i) | |
in | |
{tp = Tp.string str; parse = p} | |
let fail = | |
{ tp = Tp.bot; | |
parse = fun s i -> raise (ParseFailure i) } | |
let (||) p1 p2 = | |
let p' s i = | |
if i < String.length s then | |
if C.mem s.[i] p1.tp.Tp.first then | |
p1.parse s i | |
else if C.mem s.[i] p2.tp.Tp.first then | |
p2.parse s i | |
else if p1.tp.Tp.null then | |
p1.parse s i | |
else if p2.tp.Tp.null then | |
p2.parse s i | |
else | |
raise (ParseFailure i) | |
else if p1.tp.Tp.null then | |
p1.parse s i | |
else if p2.tp.Tp.null then | |
p2.parse s i | |
else | |
raise (ParseFailure i) | |
in | |
{tp = Tp.alt p1.tp p2.tp; parse = p' } | |
let any ps = List.fold_left (||) fail ps | |
let fix f = | |
let g t = (f {fail with tp = t}).tp in | |
let r = ref (fail.parse) in | |
let p = f {tp = (Tp.fix g); parse = fun s i -> !r s i} in | |
r := p.parse; | |
p | |
let parse p = p.parse | |
end | |
module Sexp = struct | |
open Parser | |
let letter = charset (C.of_string "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ") | |
let digit = charset (C.of_string "0123456789") | |
let whitespace = charset (C.of_string " \t\n") | |
let (==>) p f = let+ x = p in f x | |
let (>>) p1 p2 = | |
let+ x = p1 | |
and+ _ = p2 in | |
x | |
let star p = | |
fix (fun r -> | |
any [ eps ==> (fun () -> []); | |
seq p r ==> (fun (x, xs) -> x :: xs) | |
]) | |
let starskip p = | |
fix (fun r -> | |
any [ eps ==> (fun _ -> ()); | |
seq p r ==> (fun _ -> ()) | |
]) | |
let symbol = | |
let+ c = letter | |
and+ cs = star letter | |
and+ _ = starskip whitespace | |
in | |
let b = Buffer.create 0 in | |
List.iter (Buffer.add_char b) (c :: cs); | |
Buffer.contents b | |
type sexp = | |
| Sym of string | |
| Seq of sexp list | |
let rec generate_list g (fuel : int) = | |
if (fuel = 0) then | |
[] | |
else if fuel = 1 then | |
[ g fuel ] | |
else | |
let i = Random.int fuel in (* Divide the fuel *) | |
let x = g i in | |
let xs = generate_list g (fuel - i) in | |
x :: xs | |
let paren p = | |
let+ () = char '(' >> starskip whitespace | |
and+ x = p | |
and+ () = char ')' >> starskip whitespace | |
in | |
x | |
let sexp = | |
fix (fun r -> | |
any [ symbol ==> (fun s -> Sym s); | |
paren (star r) ==> (fun xs -> Seq xs) | |
]) | |
end | |
module Test = struct | |
open Sexp | |
(* This module randomly generates some huge s-expressions, and | |
then tries to parse them *) | |
let generate_symbol fuel = | |
Char.(escaped (chr (65 + Random.int 26))) | |
let rec generate_sexp fuel = | |
if fuel = 0 then | |
Seq [] | |
else if fuel = 1 then | |
Sym (generate_symbol fuel) | |
else | |
Seq (generate_list generate_sexp fuel) | |
let rec print_sexp out = function | |
| Sym s -> Format.fprintf out "%s" s | |
| Seq xs -> Format.fprintf out "(%a)" print_sexps xs | |
and print_sexps out = function | |
| [] -> () | |
| [s] -> print_sexp out s | |
| x :: xs -> Format.fprintf out "%a %a" print_sexp x print_sexps xs | |
let string_of_sexp sexp = | |
let b = Buffer.create 0 in | |
let out = Format.formatter_of_buffer b in | |
let () = print_sexp out sexp in | |
Buffer.contents b | |
let time f x = | |
let t = Sys.time () in | |
let _ = f x in | |
(Sys.time () -. t) | |
let s1000k = string_of_sexp (generate_sexp 1000000) | |
let s10M = string_of_sexp (generate_sexp 10000000) | |
let test str = | |
let len = String.length str in | |
let t = time (fun s -> Parser.parse sexp s 0) str in | |
let rate = (float_of_int len /. t) in begin | |
Printf.printf "String length: %d bytes\n" len; | |
Printf.printf "Parser elapsed time: %.3f sec\n" t; | |
Printf.printf "Parsing rate: %.3g bytes/sec\n\n" rate; | |
end | |
let _ = test s1000k | |
let _ = test s10M | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment