Last active
April 23, 2018 02:59
-
-
Save hkoba/5495600 to your computer and use it in GitHub Desktop.
Toy S-expression parser in OCaml with recursive descent parsing.
Only supports symbol, integer and list. (This is just a partial port of brilliant awklisp: https://github.com/darius/awklisp/blob/master/awklisp )
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module CharClass = | |
struct | |
let is_space c = c = ' ' || c = '\t' | |
let strrange str s e = | |
String.sub str s (e-s) | |
let strseek str start fn = | |
let pos = ref start | |
and end_ = String.length str in | |
while !pos < end_ && (fn (str.[!pos])) do incr pos done; | |
!pos | |
let skip_ws str start = strseek str start (fun c -> is_space c) | |
let seek_ws str start = strseek str start (fun c -> not (is_space c)) | |
let strtok str start = | |
if (String.length str) <= start then | |
None | |
else | |
let ts = skip_ws str start in | |
if (String.length str) <= ts then | |
None | |
else | |
Some (ts, seek_ws str ts) | |
exception Bad_ccdef of string | |
let ccdefs_of_string str = | |
let ccdef str start end_ = | |
let s = strrange str start end_ | |
and len = end_ - start in | |
if len = 1 then | |
let n = int_of_char str.[start] in | |
(n, n, s) | |
else if str.[start] = '-' then | |
let n = int_of_string (strrange str (start+1) end_) in | |
(n, n, s) | |
else if len = 3 && str.[start+1] = '-' then | |
(int_of_char str.[start], int_of_char str.[start+2], s) | |
else | |
raise (Bad_ccdef s) | |
in | |
let rec parse str pos = | |
match (strtok str pos) with | |
None -> [] | |
| Some (ts, te) -> (ccdef str ts te) :: (parse str te) | |
in | |
parse str 0 | |
let ccdef_fst (x, _, _) = x | |
let ccdef_snd (_, x, _) = x | |
let ccdef_trd (_, _, x) = x | |
let foreach lst fn = List.iter fn lst | |
let cc_of_string str = | |
let cc = Array.create 256 false | |
and ccdefs = ccdefs_of_string str | |
in | |
foreach ccdefs (fun ccr -> | |
for i = (ccdef_fst ccr) to (ccdef_snd ccr) do | |
cc.(i) <- true | |
done | |
); | |
cc | |
let in_cc cc c = cc.(int_of_char c) | |
let cc_match_one cc str pos = | |
in_cc cc str.[!pos] && (incr pos; true) | |
let cc_match_many cc str pos = | |
let nmatch = ref 0 in | |
while (!pos < String.length str) && in_cc cc str.[!pos] | |
do incr pos; incr nmatch done; | |
!nmatch > 0 | |
let char_match c str pos = | |
str.[!pos] = c && (incr pos; true) | |
end | |
module Sexparse = | |
struct | |
open CharClass | |
let cc_paren = cc_of_string "( ) '" | |
let cc_atom = cc_of_string "_ A-Z a-z 0-9 = ! @ $ % & * < > ? + \\ - * / :" | |
let cc_ws = cc_of_string "-9 -32" | |
let cc_digit = cc_of_string "0-9" | |
let ignore v = () | |
let skip_ws str pos = ignore (cc_match_many cc_ws str pos) | |
type sexp = Symbol of string | Int of int | Lst of sexp list | |
exception Syntax_error of string | |
let read ?(start=0) str = | |
let rec _read str pos = | |
skip_ws str pos; | |
let start = !pos in | |
if (String.length str) <= !pos then | |
Lst [] | |
else if char_match '(' str pos; then | |
Lst (_list str pos []) | |
else if char_match '\'' str pos; then | |
Lst ((Symbol "quote") :: [_read str pos]) | |
else if cc_match_many cc_digit str pos then | |
Int (int_of_string (strrange str start !pos)) | |
else if cc_match_many cc_atom str pos then | |
Symbol (strrange str start !pos) | |
else | |
raise (Syntax_error "foo") | |
and _list str pos res = | |
skip_ws str pos; | |
if char_match ')' str pos then | |
List.rev res | |
else | |
_list str pos (_read str pos :: res) | |
in | |
let pos = (ref start) in | |
let res = _read str pos in | |
res, !pos | |
end | |
;; | |
(* | |
print_endline (Std.dump (Sexparse.read ~start:19 "(foo '123 baz) bar baz")) | |
*) | |
(* | |
# open Sexparse;; | |
# read "(foo bar baz)";; | |
- : Sexparse.sexp * int = | |
(Lst [Symbol "foo"; Symbol "bar"; Symbol "baz"], 13) | |
# read "(foo 123 baz)";; | |
- : Sexparse.sexp * int = (Lst [Symbol "foo"; Int 123; Symbol "baz"], 13) | |
# read "(foo '123 baz)";; | |
- : Sexparse.sexp * int = | |
(Lst [Symbol "foo"; Lst [Symbol "quote"; Int 123]; Symbol "baz"], 14) | |
# read "(foo 123) bar 'baz";; | |
- : Sexparse.sexp * int = (Lst [Symbol "foo"; Int 123], 9) | |
# read ~start:9 "(foo 123) bar 'baz";; | |
- : Sexparse.sexp * int = (Symbol "bar", 13) | |
# read ~start:13 "(foo 123) bar 'baz";; | |
- : Sexparse.sexp * int = (Lst [Symbol "quote"; Symbol "baz"], 18) | |
# read ~start:18 "(foo 123) bar 'baz";; | |
- : Sexparse.sexp * int = (Lst [], 18) | |
# | |
*) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment