Created
May 12, 2026 19:24
-
-
Save moea/dfdb6ad6abdafcc8fc6e1ce93e3f5d9c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ;; ============================================================================ | |
| ;; JSON parser, built on algebraic effects + multi-shot continuations, | |
| ;; with a homegrown combinator DSL on top of `defmacro`. | |
| ;; | |
| ;; The bottom of the file (the actual grammar) reads close to BNF: | |
| ;; | |
| ;; (deftoken null "null" (JNull)) | |
| ;; (deftoken true "true" (JBool true)) | |
| ;; (defparser json-array | |
| ;; (JArr (between! "[" (sep! json-value ",") "]"))) | |
| ;; | |
| ;; Everything above the grammar is plumbing: Parse + Choose effects, | |
| ;; the parser combinator zoo, and a macro layer that turns combinator | |
| ;; calls back into something that scans like a grammar. | |
| ;; | |
| (namespace examples.json) | |
| ;; ---------------------------------------------------------------------------- | |
| ;; JSON value type | |
| ;; ---------------------------------------------------------------------------- | |
| (deftype Json [] | |
| (JNull) | |
| (JBool bool) | |
| (JNum float) | |
| (JStr string) | |
| (JArr (Vector Json)) | |
| (JObj (Map (Pair string Json)))) | |
| ;; ---------------------------------------------------------------------------- | |
| ;; Effects | |
| ;; | |
| ;; Multi-shot continuations matter because the cursor lives in a | |
| ;; `with-state` handler, encoded as a state-passing function (see | |
| ;; `stdlib/core/effects.stutter`). Each `(resume k …)` re-enters the body | |
| ;; with the cursor as it was at the choice point — so `<|>` does NOT | |
| ;; need to snapshot / restore the cursor by hand. | |
| ;; ---------------------------------------------------------------------------- | |
| (effect Parse | |
| (peek :: (-> (Maybe char))) | |
| (advance :: (-> unit)) | |
| (pos :: (-> int)) | |
| (seek :: (-> int unit))) | |
| (effect Choose | |
| (choose :: (forall [A] (-> (List A) A))) | |
| (fail-choice :: (forall [A] (-> A)))) | |
| ;; ---------------------------------------------------------------------------- | |
| ;; Combinators | |
| ;; ---------------------------------------------------------------------------- | |
| (define satisfy :: | |
| (forall [e] (-> (-> char bool) char ! <Parse Exn ...e>)) | |
| (fn [pred] | |
| (match (peek) | |
| ((None) (raise :eof)) | |
| ((Some c) (if (pred c) | |
| (do (advance) c) | |
| (raise :unexpected)))))) | |
| (define any-char :: (forall [e] (-> char ! <Parse Exn ...e>)) | |
| (fn [] (satisfy (fn [_] true)))) | |
| (define char1 :: (forall [e] (-> char char ! <Parse Exn ...e>)) | |
| ;; Named `char1` rather than `char` to avoid shadowing the `char` type. | |
| (fn [c] (satisfy (fn [x] (char-eq x c))))) | |
| (define eof :: (forall [e] (-> unit ! <Parse Exn ...e>)) | |
| (fn [] | |
| (match (peek) | |
| ((None) :unit) | |
| ((Some _) (raise :expected-eof))))) | |
| (define <|>-impl :: | |
| (forall [A e] | |
| (-> (-> A ! <Parse Exn Choose ...e>) | |
| (-> A ! <Parse Exn Choose ...e>) | |
| A ! <Parse Exn Choose ...e>)) | |
| (fn [p q] | |
| (if (choose '(true false)) (p) (q)))) | |
| (define many :: | |
| (forall [A e] | |
| (-> (-> A ! <Parse Exn Choose ...e>) | |
| (Vector A) ! <Parse Choose ...e>)) | |
| (fn [p] | |
| (<|>-impl | |
| (fn [] (let [x (p)] (seq-cons x (many p)))) | |
| (fn [] (empty-vec))))) | |
| (define many1 :: | |
| (forall [A e] | |
| (-> (-> A ! <Parse Exn Choose ...e>) | |
| (Vector A) ! <Parse Exn Choose ...e>)) | |
| (fn [p] (let [x (p)] (seq-cons x (many p))))) | |
| (define sep-by :: | |
| (forall [A B e] | |
| (-> (-> A ! <Parse Exn Choose ...e>) | |
| (-> B ! <Parse Exn Choose ...e>) | |
| (Vector A) ! <Parse Choose ...e>)) | |
| (fn [p sep] | |
| (<|>-impl | |
| (fn [] (let [x (p) | |
| xs (many (fn [] (do (sep) (p))))] | |
| (seq-cons x xs))) | |
| (fn [] (empty-vec))))) | |
| (define between-impl :: | |
| (forall [A B C e] | |
| (-> (-> A ! <Parse Exn Choose ...e>) | |
| (-> B ! <Parse Exn Choose ...e>) | |
| (-> C ! <Parse Exn Choose ...e>) | |
| B ! <Parse Exn Choose ...e>)) | |
| (fn [open p close] | |
| (do (open) (let [x (p)] (do (close) x))))) | |
| (define string-literal :: | |
| (forall [e] (-> string string ! <Parse Exn ...e>)) | |
| (fn [s] | |
| (let [cs (string->chars s)] | |
| (loop [i 0] | |
| (if (= i (count cs)) | |
| s | |
| (do (char1 (nth cs i)) | |
| (recur (+ i 1)))))))) | |
| (define lexeme :: | |
| (forall [A e] | |
| (-> (-> A ! <Parse Exn Choose ...e>) A ! <Parse Exn Choose ...e>)) | |
| (fn [p] | |
| (let [x (p)] | |
| (do (skip-ws) x)))) | |
| ;; Forward-referenced via `skip-ws`; defined via a macro below once | |
| ;; `one-of`/`char-class` are in scope. | |
| ;; ============================================================================ | |
| ;; Lean on pattern-template macros to turn the combinator surface | |
| ;; into something that reads like a grammar. Features used: | |
| ;; - rest patterns + rest splicing (`?xs ...`) for n-ary forms | |
| ;; - recursive expansion for unfolding n-ary into binary | |
| ;; - `:when` guards for class-keyword dispatch | |
| ;; - `id-join` for generative identifier construction | |
| ;; - hygiene: bare-name binders in `(fn [...] B)` are gensym'd | |
| ;; per expansion, so the helper macros below don't capture any | |
| ;; user-supplied symbol. | |
| ;; | |
| ;; Everything below this point is pure surface sugar; deleting all the | |
| ;; macros would leave a working (if ugly) parser that uses the | |
| ;; combinators directly. The point is to show how far the macro system | |
| ;; carries us without language-level extension. | |
| ;; ============================================================================ | |
| ;; ---- n-ary <|> ------------------------------------------------------------ | |
| ;; | |
| ;; `(alt P1 P2 P3)` → `(<|>-impl (fn [] P1) (<|>-impl (fn [] P2) (fn [] P3)))`. | |
| ;; Recursive expansion unfolds the n-ary form; the empty case is | |
| ;; deliberately a parse-error so `(alt)` fails fast at use. | |
| (defmacro alt | |
| [(alt) -> (raise :empty-alt)] | |
| [(alt ?p) -> ?p] | |
| [(alt ?p ?rest ...) -> (<|>-impl (fn [] ?p) (fn [] (alt ?rest ...)))]) | |
| ;; ---- character-class predicates ------------------------------------------- | |
| ;; | |
| ;; `(any-of? c #\a #\b #\c)` → `(or (char-eq c #\a) (or (char-eq c #\b) | |
| ;; (char-eq c #\c)))`. Used to build satisfy predicates from char-literal | |
| ;; sets without writing the chained `or` by hand. The variable threaded | |
| ;; through is captured by name and re-used at each leaf — hygiene means | |
| ;; `c` here doesn't collide with any `c` in user code. | |
| (defmacro any-of? | |
| [(any-of? ?x) -> false] | |
| [(any-of? ?x ?c) -> (char-eq ?x ?c)] | |
| [(any-of? ?x ?c ?rest ...) -> (or (char-eq ?x ?c) (any-of? ?x ?rest ...))]) | |
| ;; `(char-class :digit)`, `(char-class :alpha)`, etc. — each yields a | |
| ;; `(-> char bool)` predicate. The `:any-of` and `:not` shapes compose; | |
| ;; pattern matching on the leading keyword keeps the macro readable. | |
| (defmacro char-class | |
| [(char-class :digit) -> char-digit?] | |
| [(char-class :alpha) -> char-alpha?] | |
| [(char-class :whitespace) -> char-whitespace?] | |
| [(char-class :any-of ?cs ...) | |
| -> (fn [c] (any-of? c ?cs ...))] | |
| [(char-class :not ?cls) | |
| -> (fn [c] (not ((char-class ?cls) c)))]) | |
| ;; `(one-of #\[ #\])` → satisfy-disjunction parser. Eliminates the | |
| ;; "satisfy + or-chain of char-eq" pattern entirely. | |
| (defmacro one-of | |
| [(one-of ?cs ...) -> (satisfy (char-class :any-of ?cs ...))]) | |
| (defmacro none-of | |
| [(none-of ?cs ...) -> (satisfy (char-class :not (:any-of ?cs ...)))]) | |
| ;; ---- pure / =>: lift constants & map parser results ----------------------- | |
| ;; `(pure V)` → a parser thunk that succeeds without consuming and returns V. | |
| (defmacro pure | |
| [(pure ?v) -> (fn [] ?v)]) | |
| ;; `(=> P F)` → run P, apply F to the result. The function form `F` is | |
| ;; substituted as-is; if it's a constructor (`JArr`) or a let-bound fn, | |
| ;; the result is direct-style. | |
| (defmacro => | |
| [(=> ?p ?f) -> (?f (?p))]) | |
| ;; ---- lexeme + token literals --------------------------------------------- | |
| ;; `(token "...")` → match the literal, then skip trailing whitespace. | |
| ;; Wrapping in lexeme means tokens normalize whitespace as we go, so | |
| ;; the grammar doesn't have to think about it. | |
| (defmacro token | |
| [(token ?s) -> (lexeme (fn [] (string-literal ?s)))]) | |
| ;; `(between! "[" body "]")` — wraps body between two string-literal | |
| ;; tokens. The `!` distinguishes from the underlying `between-impl` | |
| ;; combinator; the macro thunks each arg for us. | |
| (defmacro between! | |
| [(between! ?open ?body ?close) | |
| -> (between-impl (fn [] (token ?open)) | |
| (fn [] ?body) | |
| (fn [] (token ?close)))]) | |
| ;; `(sep! p ",")` — sep-by with a literal-string separator. | |
| (defmacro sep! | |
| [(sep! ?p ?sep) | |
| -> (sep-by (fn [] ?p) (fn [] (token ?sep)))]) | |
| ;; ---- defparser: name + body → top-level parser define -------------------- | |
| ;; | |
| ;; The signature is the canonical row `<Parse Exn Choose>`. Pinning the | |
| ;; row at the macro means rule bodies don't have to write the signature | |
| ;; per rule — but they're still elaborated against the full effect row, | |
| ;; so any effect mismatch surfaces at the body, not at the wrapper. | |
| (defmacro defparser | |
| [(defparser ?name ?ret ?body) | |
| -> (define ?name :: | |
| (forall [e] (-> ?ret ! <Parse Exn Choose ...e>)) | |
| (fn [] ?body))]) | |
| ;; ---- deftoken: keyword literal → constant parser -------------------------- | |
| ;; | |
| ;; `(deftoken null "null" (JNull))` generates: | |
| ;; | |
| ;; (define json-null :: (forall [e] (-> Json ! <Parse Exn Choose ...e>)) | |
| ;; (fn [] (do (token "null") (JNull)))) | |
| ;; | |
| ;; The generated identifier `json-null` is built via `id-join` from | |
| ;; the literal prefix `"json-"` and the user-supplied tag. This is | |
| ;; the most "ambitious" macro in the file — generative naming pulled | |
| ;; from a template argument. | |
| (defmacro deftoken | |
| [(deftoken ?tag ?lit ?val) | |
| -> (define (id-join "json-" ?tag) :: | |
| (forall [e] (-> Json ! <Parse Exn Choose ...e>)) | |
| (fn [] (do (token ?lit) ?val)))]) | |
| ;; ---- skip-ws (now that we have one-of-style sugar) ----------------------- | |
| (define skip-ws :: (forall [e] (-> unit ! <Parse Choose ...e>)) | |
| (fn [] | |
| (let [_ (many (fn [] (satisfy (char-class :whitespace))))] | |
| :unit))) | |
| ;; ============================================================================ | |
| ;; This is the payoff. Compare to the pre-macro version: every rule | |
| ;; now reads as the grammar production it represents. | |
| ;; | |
| ;; Stretch goal (TODO(elab)): a single-form `(grammar ...)` macro that | |
| ;; emits multiple top-level `define`s in one shot. defmacro | |
| ;; expansion produces ONE surf form per macro call, so today a real | |
| ;; multi-emit `grammar` macro needs elaborator support for a | |
| ;; `(splice-defs (define ...) ...)` form at the top level. Without | |
| ;; that, we write one `defparser` / `deftoken` per rule. The | |
| ;; per-rule sugar already does most of the work. | |
| ;; ============================================================================ | |
| ;; null / true / false — three rules, three lines. | |
| (deftoken null "null" (JNull)) | |
| (deftoken true "true" (JBool true)) | |
| (deftoken false "false" (JBool false)) | |
| ;; ---- numbers -------------------------------------------------------------- | |
| (define digit-chars :: | |
| (forall [e] (-> (Vector char) ! <Parse Exn Choose ...e>)) | |
| (fn [] (many1 (fn [] (satisfy (char-class :digit)))))) | |
| (defparser sign-chars (Vector char) | |
| (alt (let [c (char1 #\-)] (seq-cons c (empty-vec))) | |
| (empty-vec))) | |
| (defparser frac-chars (Vector char) | |
| (alt (let [dot (char1 #\.) | |
| ds (digit-chars)] | |
| (seq-cons dot ds)) | |
| (empty-vec))) | |
| (defparser exp-chars (Vector char) | |
| (alt (let [e (alt (char1 #\e) (char1 #\E)) | |
| sg (alt (let [c (alt (char1 #\+) (char1 #\-))] | |
| (seq-cons c (empty-vec))) | |
| (empty-vec)) | |
| ds (digit-chars)] | |
| (seq-append (seq-cons e sg) ds)) | |
| (empty-vec))) | |
| (defparser json-number Json | |
| (lexeme | |
| (fn [] | |
| (let [s (sign-chars) | |
| int (digit-chars) | |
| fr (frac-chars) | |
| ex (exp-chars) | |
| raw (chars->string | |
| (seq-append (seq-append s int) (seq-append fr ex)))] | |
| (match (str->float raw) | |
| ((None) (raise :bad-number)) | |
| ((Some f) (JNum f))))))) | |
| ;; ---- strings -------------------------------------------------------------- | |
| ;; | |
| ;; The `(=> (any-char) escape-decode)` shape on the escape branch shows | |
| ;; the macro layer doing real work: the macro flips control flow to | |
| ;; the helper without obscuring intent. | |
| (define escape-decode :: (-> char char) | |
| (fn [c] | |
| (cond | |
| (char-eq c #\") #\" | |
| (char-eq c #\\) #\\ | |
| (char-eq c #\/) #\/ | |
| (char-eq c #\n) #\newline | |
| (char-eq c #\t) #\tab | |
| (char-eq c #\r) #\return | |
| (char-eq c #\b) (int->char 8) | |
| (char-eq c #\f) (int->char 12) | |
| ;; TODO: \uXXXX — read 4 hex digits, int->char. Skipped here. | |
| true (error "json: bad escape")))) | |
| (defparser json-string-char char | |
| (alt (do (char1 #\\) (=> any-char escape-decode)) | |
| (none-of #\" #\\))) | |
| (defparser json-string-body string | |
| (=> (fn [] (many json-string-char)) chars->string)) | |
| (defparser json-string Json | |
| (lexeme (fn [] | |
| (=> (fn [] | |
| (between-impl | |
| (fn [] (char1 #\")) | |
| json-string-body | |
| (fn [] (char1 #\")))) | |
| JStr)))) | |
| ;; ---- arrays & objects ---------------------------------------------------- | |
| (defparser json-array Json | |
| (=> (fn [] (between! "[" (sep! (json-value) ",") "]")) JArr)) | |
| (defparser json-pair (Pair string Json) | |
| (let [k (lexeme (fn [] | |
| (between-impl | |
| (fn [] (char1 #\")) | |
| json-string-body | |
| (fn [] (char1 #\"))))) | |
| _ (token ":") | |
| v (json-value)] | |
| (Pair k v))) | |
| (define pairs->map :: | |
| (-> (Vector (Pair string Json)) (Map (Pair string Json))) | |
| (fn [pairs] | |
| (reduce (fn [m p] (map-assoc m (:k p) (:v p))) | |
| ${} | |
| pairs))) | |
| (defparser json-object Json | |
| (=> (fn [] (=> (fn [] (between! "{" (sep! (json-pair) ",") "}")) | |
| pairs->map)) | |
| JObj)) | |
| ;; The whole grammar's dispatch — what a recursive-descent parser | |
| ;; usually buries in nested if-else, now one `alt` per rule. | |
| (defparser json-value Json | |
| (alt (json-null) | |
| (json-true) | |
| (json-false) | |
| (json-number) | |
| (json-string) | |
| (json-array) | |
| (json-object))) | |
| ;; ============================================================================ | |
| ;; Runners | |
| ;; ============================================================================ | |
| (deftype Result [A E] | |
| (Ok A) | |
| (Err E)) | |
| (define run-first :: | |
| (forall [A e] | |
| (-> (-> A ! <Choose Exn ...e>) (Result A keyword) ! <Exn ...e>)) | |
| (fn [thunk] | |
| ((handler | |
| (return v -> (Ok v)) | |
| ((choose xs) k -> | |
| (loop [xs xs] | |
| (if (empty? xs) | |
| (Err :no-alternative) | |
| (let [r (exn-or-default (Err :branch-failed) | |
| (fn [] (resume k (first xs))))] | |
| (match r | |
| ((Ok _) r) | |
| ((Err _) (recur (rest xs)))))))) | |
| ((fail-choice) -> (Err :no-alternative))) | |
| thunk))) | |
| (define run-parse :: | |
| (forall [A e] | |
| (-> string (-> A ! <Parse Choose Exn ...e>) | |
| (Result A keyword) ! <...e>)) | |
| (fn [src thunk] | |
| (let [buf (string->chars src)] | |
| (with-state 0 | |
| (fn [] | |
| ((handler | |
| (return v -> v) | |
| ((peek) k -> | |
| (let [i (get)] | |
| (if (< i (count buf)) | |
| (resume k (Some (nth buf i))) | |
| (resume k (None))))) | |
| ((advance) k -> (do (put (+ (get) 1)) (resume k :unit))) | |
| ((pos) k -> (resume k (get))) | |
| ((seek p) k -> (do (put p) (resume k :unit)))) | |
| (fn [] | |
| (exn-or-default (Err :parse-error) | |
| (fn [] (run-first thunk)))))))))) | |
| (define parse-json :: (-> string (Result Json keyword)) | |
| (fn [src] | |
| (run-parse src | |
| (fn [] | |
| (do (skip-ws) | |
| (let [v (json-value)] | |
| (do (eof) v))))))) | |
| ;; ============================================================================ | |
| ;; Demo | |
| ;; ============================================================================ | |
| (print "null =>") (print (parse-json "null")) | |
| (print "true =>") (print (parse-json " true ")) | |
| (print "42 =>") (print (parse-json "42")) | |
| (print "[1,2,3] =>") (print (parse-json "[1, 2, 3]")) | |
| (print "nested =>") (print (parse-json "{\"a\": 1, \"b\": [true, null]}")) | |
| (print "malformed =>") (print (parse-json "{\"a\": ,}")) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment