Created
February 19, 2014 13:46
-
-
Save hkoba/9092301 to your computer and use it in GitHub Desktop.
Short S-expression parser in OCaml, take2. (This time with string support). Basically for my self training.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(* -*- coding: utf-8 -*- *) | |
open Core.Std | |
module Util = struct | |
(** Perl5's defined-or operator *) | |
let (//) opt def = match opt with Some x -> x | None -> def | |
let min x y = if x < y then x else y | |
let apply_if opt fn v = | |
match opt with | |
| None -> v | |
| Some bound -> fn bound v | |
let cap_if opt v = | |
apply_if opt min v | |
end | |
module Strutil = struct | |
include Util | |
(** [compare left right] is an extended version of {!String.compare} | |
for string regions(I hope). | |
[~loff], [~roff] is where substring starts. | |
[~llen], [~rlen] is length to compare. | |
*) | |
let compare ?(loff=0) ?llen left ?(roff=0) ?rlen right = | |
let cmp l r = | |
if l < r then -1 | |
else if l > r then 1 | |
else 0 | |
in | |
let rec loop left loff llen right roff rlen i len = | |
if i >= len then | |
cmp llen rlen | |
else | |
let n = cmp left.[loff + i] right.[roff + i] in | |
if n <> 0 then | |
n | |
else | |
loop left loff llen right roff rlen | |
(i+1) len | |
in | |
let llen = cap_if llen (String.length left - loff) | |
and rlen = cap_if rlen (String.length right - roff) in | |
loop left loff llen right roff rlen | |
0 (min llen rlen) | |
end | |
module Strrange = struct | |
include Util | |
module SU = Strutil | |
type t = {base: string; mutable start: int; mutable finish: int} | |
let create ?(start=0) ?finish ?(empty=false) str = | |
{base = str; start = start; | |
finish = | |
if empty then start | |
else min (finish // String.length str) (String.length str)} | |
let of_string str = create str | |
let to_string ?(offset=0) {base; start; finish} = | |
let start = start + offset in | |
String.sub base ~pos:start ~len:(finish - start) | |
let copy ?start ?finish range = | |
{range with start = start // range.start; finish = finish // range.finish} | |
let deep_copy {base; start; finish} = | |
{base = String.copy base; start = start; finish = finish} | |
let length {start; finish; _} = finish - start | |
(*let is_empty {start; finish; _} = start >= finish*) | |
let get ?(offset=0) range = | |
range.base.[range.start + offset] | |
let compare ?len left right = | |
SU.compare | |
~loff:left.start ~llen:(cap_if len (length left)) left.base | |
~roff:right.start ~rlen:(cap_if len (length right)) right.base | |
end | |
module Strcursor = struct | |
module SU = Strutil | |
type t = {subject: string; mutable pos: int} | |
let create ?(pos=0) subject = {subject; pos} | |
let of_string ?(pos=0) subject = {subject; pos} | |
let to_subject_pos {subject; pos} = subject, pos | |
let pos sc = sc.pos | |
let to_string ?(start=0) sc = | |
String.sub sc.subject ~pos:start ~len:(sc.pos - start) | |
let to_strrange ?(start=0) {subject; pos} = | |
Strrange.create ~start ~finish:pos subject | |
let peek ?(off=0) cursor = | |
cursor.subject.[cursor.pos + off] | |
let length {subject; pos} = | |
String.length subject - pos | |
let can_peek ?(len=1) cursor = | |
length cursor - len >= 0 | |
let empty cursor = | |
not (can_peek cursor) | |
let advance ?(len=1) cursor = | |
cursor.pos <- cursor.pos + len; | |
true (* XXX: maybe bad habit *) | |
let advance_while test cursor = | |
while can_peek cursor && test (peek cursor); do | |
ignore (advance cursor) | |
done; | |
pos cursor | |
let compare str cursor = | |
SU.compare str ~roff:cursor.pos cursor.subject | |
exception End_of_cursor of t | |
let get ?(skip=0) cursor = | |
if not (can_peek ~len:(skip+1) cursor) then | |
raise (End_of_cursor cursor) | |
else | |
(if skip > 0 then | |
ignore (advance ~len:skip cursor); | |
let ch = peek cursor in | |
ignore (advance cursor); | |
ch) | |
end | |
module Strtok = struct | |
include Strrange | |
module SC = Strcursor | |
let space c = c = ' ' || c = '\t' || c = '\n' | |
let non_space c = not (space c) | |
let next_token sc = | |
let start = SC.advance_while space sc in | |
if not (SC.can_peek sc) then | |
None | |
else | |
(ignore (SC.advance_while non_space sc); | |
Some (SC.to_strrange ~start:start sc)) | |
let token_list_of_string str = | |
let rec loop sc lst = | |
(*Printf.printf "pos %d in subj %s\n" (SC.pos sc) sc.SC.subject;*) | |
match next_token sc with | |
| None -> List.rev lst | |
| Some tok -> loop sc (tok :: lst) | |
in | |
let sc = SC.of_string str in | |
loop sc [] | |
end | |
module CharRangeList = struct | |
module ST = Strtok | |
module Spec = struct | |
type t = {first: int; last: int} | |
exception Bad_charrange of Strrange.t | |
let create first last = | |
{first = first; last = last} | |
let of_strrange sr = | |
let ord = int_of_char in | |
let len = ST.length sr in | |
if len = 1 then | |
let n = ord (ST.get sr) in | |
create n n | |
else if ST.get sr = '-' then | |
let n = int_of_string (ST.to_string ~offset:1 sr) in | |
create n n | |
else if len = 3 && (ST.get ~offset:1 sr) = '-' then | |
create (ord (ST.get sr)) (ord (ST.get ~offset:2 sr)) | |
else | |
(* XXX: Is this good? *) | |
raise (Bad_charrange sr) | |
end | |
type t = Spec.t list | |
let of_string str = | |
List.map ~f:Spec.of_strrange (ST.token_list_of_string str) | |
end | |
module CharClass = struct | |
type t = bool array | |
module CRL = CharRangeList | |
let create ?(init=false) () = | |
Array.create ~len:256 init | |
(*Without ~len, I got: | |
Warning 6: labels were omitted in the application of this function. | |
*) | |
let invert cc = | |
Array.map ~f:(fun b -> not b) cc | |
let set_charrange cc ?(value=true) cr = | |
for i = cr.CRL.Spec.first to cr.CRL.Spec.last do | |
cc.(i) <- value | |
done; | |
cc | |
let of_string str = | |
let cc = create () | |
and ccdefs = CRL.of_string str in | |
List.iter ccdefs ~f:(fun def -> ignore (set_charrange cc def)); | |
cc | |
let contains cc ch = cc.(int_of_char ch) | |
end | |
module Strmatch = struct | |
module SC = Strcursor | |
module CC = CharClass | |
let char ch sc = | |
SC.peek sc = ch && SC.advance sc | |
let string string sc = | |
SC.compare string sc = 0 | |
&& SC.advance ~len:(String.length string) sc | |
let can_advance_and_match cc sc = | |
SC.can_peek sc | |
&& CC.contains cc (SC.peek sc) | |
let one cc sc = | |
can_advance_and_match cc sc | |
&& SC.advance sc | |
let many cc sc = | |
let nmatch = ref 0 in | |
while can_advance_and_match cc sc do | |
ignore (SC.advance sc); incr nmatch | |
done; | |
!nmatch > 0 | |
end | |
module Sexparse1 = struct | |
module SC = Strcursor | |
module SR = Strrange | |
module SM = Strmatch | |
module CC = CharClass | |
let cc_atom = CC.of_string "_ A-Z a-z 0-9 = ! @ $ % & * < > ? + - * / :" | |
let cc_ws = CC.of_string "-9 -32" | |
let cc_digit = CC.of_string "0-9" | |
let cc_plmin = CC.of_string "+ -" | |
let cc_in_string = CC.invert (CC.of_string "\" \\") | |
let skip_ws sc = ignore (SM.many cc_ws sc) | |
type sexp = Symbol of string | Int of int | List of sexp list | |
| String of string | |
exception Syntax_error of string * SC.t | |
let read ?(start=0) str = | |
let rec _read cursor = | |
skip_ws cursor; | |
let start = SC.pos cursor in | |
if SC.empty cursor then | |
List [] | |
else if SM.char '(' cursor; then | |
List (_list cursor []) | |
else if SM.char '\'' cursor; then | |
List ((Symbol "quote") :: [_read cursor]) | |
else if SM.char '\"' cursor; then | |
String (_string cursor) | |
else if ignore (SM.char '-' cursor); SM.many cc_digit cursor then | |
Int (int_of_string (SC.to_string ~start cursor)) | |
else if SM.many cc_atom cursor then | |
Symbol (SC.to_string ~start cursor) | |
else | |
(* XXX: proper error reporting *) | |
raise (Syntax_error ("Invalid character", cursor)) | |
and _list cursor res = | |
skip_ws cursor; | |
if SM.char ')' cursor then | |
List.rev res | |
else | |
_list cursor (_read cursor :: res) | |
and _string cursor = | |
let rec loop cursor lst = | |
let start = SC.pos cursor in | |
if SM.many cc_in_string cursor then | |
loop cursor (SC.to_string ~start cursor :: lst) | |
else if SM.char '"' cursor then | |
String.concat (List.rev lst) | |
else if SM.char '\\' cursor then | |
if SC.can_peek cursor then | |
loop cursor (String.of_char (SC.get cursor) :: lst) | |
else | |
raise (Syntax_error ("String ended shortly", cursor)) | |
else | |
raise (Syntax_error ("Can't find string terminator", cursor)) | |
in | |
loop cursor [] | |
in | |
let cursor = (SC.of_string ~pos:start str) in | |
let res = _read cursor in | |
res, cursor | |
end |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(* -*- coding: utf-8 -*- *) | |
open OUnit2 | |
open Parse1 | |
open Sexparse1 | |
module SC = Strcursor | |
type spec = int * string * int * sexp | |
let () = run_test_tt_main | |
("suite">::: | |
(List.map | |
(fun (start, str, last, sexp) -> | |
str>::(fun _ -> | |
let res, sc = read ~start str in | |
assert_equal | |
(res, SC.pos sc) | |
(sexp, last) | |
) | |
) | |
[0, "(foo bar baz)", 13 | |
, List [Symbol "foo"; Symbol "bar"; Symbol "baz"] | |
;0, "(foo 123 baz)", 13 | |
, List [Symbol "foo"; Int 123; Symbol "baz"] | |
;0, "(foo '123 baz)", 14 | |
, List [Symbol "foo"; List [Symbol "quote"; Int 123]; Symbol "baz"] | |
;0, "(foo 123) bar 'baz", 9 | |
, List [Symbol "foo"; Int 123] | |
;9, "(foo 123) bar 'baz", 13 | |
, Symbol "bar" | |
;13, "(foo 123) bar 'baz", 18 | |
, List [Symbol "quote"; Symbol "baz"] | |
;18, "(foo 123) bar 'baz", 18 | |
, List [] | |
;0, "(number 1 -1 )", 14 | |
, List [Symbol "number"; Int 1; Int (-1)] | |
;0, "(str \"foo \\x bar\\\"baz qux\" hoe)", 31 | |
, List [Symbol "str"; String "foo x bar\"baz qux"; | |
Symbol "hoe"] | |
]) | |
) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment