Skip to content

Instantly share code, notes, and snippets.

@neel-krishnaswami
Last active September 6, 2023 13:43
Show Gist options
  • Save neel-krishnaswami/b1594c57433b7df2a143634a2fff3544 to your computer and use it in GitHub Desktop.
Save neel-krishnaswami/b1594c57433b7df2a143634a2fff3544 to your computer and use it in GitHub Desktop.
A linear-time parser combinator library in Ocaml
module C : sig
type t
val empty : t
val one : char -> t
val union : t -> t -> t
val inter : t -> t -> t
val top : t
val mem : char -> t -> bool
val make : (char -> bool) -> t
val equal : t -> t -> bool
val negate : t -> t
val is_empty : t -> bool
val disjoint : t -> t -> bool
val fold : (char -> 'a -> 'a) -> 'a -> t -> 'a
val of_string : string -> t
end = struct
type t = bytes
let make f =
let open Int in
let open Char in
Bytes.init 32 (fun i ->
let b0 = shift_left (Bool.to_int (f (chr (i * 8 + 0)))) 0 in
let b1 = shift_left (Bool.to_int (f (chr (i * 8 + 1)))) 1 in
let b2 = shift_left (Bool.to_int (f (chr (i * 8 + 2)))) 2 in
let b3 = shift_left (Bool.to_int (f (chr (i * 8 + 3)))) 3 in
let b4 = shift_left (Bool.to_int (f (chr (i * 8 + 4)))) 4 in
let b5 = shift_left (Bool.to_int (f (chr (i * 8 + 5)))) 5 in
let b6 = shift_left (Bool.to_int (f (chr (i * 8 + 6)))) 6 in
let b7 = shift_left (Bool.to_int (f (chr (i * 8 + 7)))) 7 in
let (||) = logor in
Char.chr (b7 || b6 || b5 || b4 || b3 || b2 || b1 || b0))
let mem c s =
let b = (Char.code c) / 8 in
let i = (Char.code c) mod 8 in
let w = Char.code (Bytes.get s b) in
Int.logand (Int.shift_left 1 i) w > 0
let empty = make (fun c -> false)
let top = make (fun c -> true)
let one c = make (fun c' -> c = c')
let union s1 s2 = make (fun c -> mem c s1 || mem c s2)
let inter s1 s2 = make (fun c -> mem c s1 && mem c s2)
let negate s = make (fun c -> not (mem c s))
let equal s1 s2 =
let rec loop i acc =
if i = 32 then
acc
else
loop (i+1) (acc && (Bytes.get s1 i = Bytes.get s2 i))
in
loop 0 true
let is_empty s = equal s empty
let disjoint s1 s2 = is_empty (inter s1 s2)
let fold f init s =
let rec loop i acc =
if i > 255 then
acc
else
let c = Char.chr i in
if mem c s then
loop (i+1) (f c acc)
else
loop (i+1) acc
in
loop 0 init
let of_string str =
let p c = String.contains str c in
make p
end
module Tp : sig
type t = { null : bool; first : C.t; follow : C.t }
exception TypeError of string
val char : char -> t
val eps : t
val seq : t -> t -> t
val charset : C.t -> t
val string : string -> t
val alt : t -> t -> t
val bot : t
val equal : t -> t -> bool
val fix : (t -> t) -> t
val print : Format.formatter -> t -> unit
end = struct
type t = {
null : bool;
first : C.t;
follow : C.t;
}
exception TypeError of string
let char c = {
null = false;
first = C.one c;
follow = C.empty;
}
let eps = {
null = true;
first = C.empty;
follow = C.empty;
}
let seq t1 t2 =
let separate t1 t2 =
not t1.null
&&
C.disjoint t1.follow t2.first
in
if separate t1 t2 then
{ null = false;
first = t1.first;
follow = C.union t2.follow (if t2.null then t1.follow else C.empty);
}
else
raise (TypeError "ambiguous sequencing")
let string s =
if String.length s = 0 then
eps
else
char s.[0]
let alt t1 t2 =
let nonoverlapping t1 t2 =
not (t1.null && t2.null)
&&
C.disjoint t1.first t2.first
in
if nonoverlapping t1 t2 then
{
null = t1.null || t2.null;
first = C.union t1.first t2.first;
follow = C.union t1.follow t2.follow;
}
else
raise (TypeError "ambiguous alternation")
let bot = {
null = false;
first = C.empty;
follow = C.empty;
}
let charset cs =
if C.is_empty cs then
bot
else
{ null = false;
first = cs;
follow = C.empty;
}
let equal t1 t2 =
t1.null = t2.null
&& C.equal t1.first t2.first
&& C.equal t1.follow t2.follow
let fix f =
let rec loop t =
let t' = f t in
if equal t t' then
t'
else
loop t'
in
loop bot
let print out t =
let p fmt = Format.fprintf out fmt in
let print_set cs =
C.fold (fun c () -> p "%c" c) () cs
in
let print_bool = function
| true -> p "true"
| false -> p "false"
in
p "{\n";
p " null = "; print_bool t.null; p ";\n";
p " first = C.of_string \""; print_set t.first; p "\";\n";
p " follow = C.of_string \""; print_set t.follow; p "\";\n";
p "}\n"
end
module Parser: sig
type 'a t
exception ParseFailure of int
val char : char -> unit t
val charset : C.t -> char t
val string : string -> unit t
val map : ('a -> 'b) -> 'a t -> 'b t
val (let+) : 'a t -> ('a -> 'b) -> 'b t
val seq : 'a t -> 'b t -> ('a * 'b) t
val (and+) : 'a t -> 'b t -> ('a * 'b) t
val eps : unit t
val return : 'a -> 'a t
val fail : 'a t
val any : 'a t list -> 'a t
val fix : ('a t -> 'a t) -> 'a t
val parse : 'a t -> string -> int -> (int * 'a)
end = struct
type 'a t = { tp : Tp.t; parse : string -> int -> int * 'a }
exception ParseFailure of int
let char c =
let p s i =
if i < String.length s && s.[i] = c then
(i+1, ())
else
raise (ParseFailure i)
in
{ tp = Tp.char c; parse = p }
let (let+) p f =
let p' s i =
let (i, v) = p.parse s i in
(i, f v)
in
{tp = p.tp; parse = p'}
let map f p = let+ x = p in f x
let (and+) p1 p2 =
let p' s i =
let (i, a) = p1.parse s i in
let (i, b) = p2.parse s i in
(i, (a,b))
in
{ tp = Tp.seq p1.tp p2.tp; parse = p' }
let seq = (and+)
let eps = { tp = Tp.eps; parse = fun s i -> (i, ()) }
let return x =
let+ () = eps in x
let charset cs =
let p s i =
if i < String.length s && C.mem s.[i] cs then
(i+1, s.[i])
else
raise (ParseFailure i)
in
{tp = Tp.charset cs; parse = p }
let string str =
let p s i =
if i + String.length str < String.length s then
let rec loop j =
if j < String.length str then
if str.[j] = s.[i + j] then
loop (j+1)
else
raise (ParseFailure (i+j))
else
(i+j, ())
in
loop 0
else
raise (ParseFailure i)
in
{tp = Tp.string str; parse = p}
let fail =
{ tp = Tp.bot;
parse = fun s i -> raise (ParseFailure i) }
let (||) p1 p2 =
let p' s i =
if i < String.length s then
if C.mem s.[i] p1.tp.Tp.first then
p1.parse s i
else if C.mem s.[i] p2.tp.Tp.first then
p2.parse s i
else if p1.tp.Tp.null then
p1.parse s i
else if p2.tp.Tp.null then
p2.parse s i
else
raise (ParseFailure i)
else if p1.tp.Tp.null then
p1.parse s i
else if p2.tp.Tp.null then
p2.parse s i
else
raise (ParseFailure i)
in
{tp = Tp.alt p1.tp p2.tp; parse = p' }
let any ps = List.fold_left (||) fail ps
let fix f =
let g t = (f {fail with tp = t}).tp in
let r = ref (fail.parse) in
let p = f {tp = (Tp.fix g); parse = fun s i -> !r s i} in
r := p.parse;
p
let parse p = p.parse
end
module Sexp = struct
open Parser
let letter = charset (C.of_string "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
let digit = charset (C.of_string "0123456789")
let whitespace = charset (C.of_string " \t\n")
let (==>) p f = let+ x = p in f x
let (>>) p1 p2 =
let+ x = p1
and+ _ = p2 in
x
let star p =
fix (fun r ->
any [ eps ==> (fun () -> []);
seq p r ==> (fun (x, xs) -> x :: xs)
])
let starskip p =
fix (fun r ->
any [ eps ==> (fun _ -> ());
seq p r ==> (fun _ -> ())
])
let symbol =
let+ c = letter
and+ cs = star letter
and+ _ = starskip whitespace
in
let b = Buffer.create 0 in
List.iter (Buffer.add_char b) (c :: cs);
Buffer.contents b
type sexp =
| Sym of string
| Seq of sexp list
let rec generate_list g (fuel : int) =
if (fuel = 0) then
[]
else if fuel = 1 then
[ g fuel ]
else
let i = Random.int fuel in (* Divide the fuel *)
let x = g i in
let xs = generate_list g (fuel - i) in
x :: xs
let paren p =
let+ () = char '(' >> starskip whitespace
and+ x = p
and+ () = char ')' >> starskip whitespace
in
x
let sexp =
fix (fun r ->
any [ symbol ==> (fun s -> Sym s);
paren (star r) ==> (fun xs -> Seq xs)
])
end
module Test = struct
open Sexp
(* This module randomly generates some huge s-expressions, and
then tries to parse them *)
let generate_symbol fuel =
Char.(escaped (chr (65 + Random.int 26)))
let rec generate_sexp fuel =
if fuel = 0 then
Seq []
else if fuel = 1 then
Sym (generate_symbol fuel)
else
Seq (generate_list generate_sexp fuel)
let rec print_sexp out = function
| Sym s -> Format.fprintf out "%s" s
| Seq xs -> Format.fprintf out "(%a)" print_sexps xs
and print_sexps out = function
| [] -> ()
| [s] -> print_sexp out s
| x :: xs -> Format.fprintf out "%a %a" print_sexp x print_sexps xs
let string_of_sexp sexp =
let b = Buffer.create 0 in
let out = Format.formatter_of_buffer b in
let () = print_sexp out sexp in
Buffer.contents b
let time f x =
let t = Sys.time () in
let _ = f x in
(Sys.time () -. t)
let s1000k = string_of_sexp (generate_sexp 1000000)
let s10M = string_of_sexp (generate_sexp 10000000)
let test str =
let len = String.length str in
let t = time (fun s -> Parser.parse sexp s 0) str in
let rate = (float_of_int len /. t) in begin
Printf.printf "String length: %d bytes\n" len;
Printf.printf "Parser elapsed time: %.3f sec\n" t;
Printf.printf "Parsing rate: %.3g bytes/sec\n\n" rate;
end
let _ = test s1000k
let _ = test s10M
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment