Last active
July 7, 2021 16:44
-
-
Save tangentstorm/74a1596e2ee1fcc0a395454a5f07e93a to your computer and use it in GitHub Desktop.
parser combinators for J
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
clear'' | |
NB. Parser Combinators for J | |
NB. | |
NB. The semantics here are heavily inspired by | |
NB. Allesandro Warth's ometa system: | |
NB. | |
NB. http://tinlizzie.org/ometa/ | |
NB. | |
NB. but implemented as parser combinators rather than a standalone language. | |
NB. | |
LICENSE =: (0 : 0) | |
Copyright (c) 2021 Michal J Wallace <http://tangentstorm.com/> | |
Permission is hereby granted, free of charge, to any person obtaining a copy | |
of this software and associated documentation files (the "Software"), to deal | |
in the Software without restriction, including without limitation the rights | |
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
copies of the Software, and to permit persons to whom the Software is | |
furnished to do so, subject to the following conditions: | |
The above copyright notice and this permission notice shall be included in all | |
copies or substantial portions of the Software. | |
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
SOFTWARE. | |
) | |
NB. s0 : s. initial parse state | |
s0 =: 0 ; '' ; 6#a: | |
NB. these names are the indices into the tuple: | |
'ix ch tb nt na nb wk ib' =: i.#s0 | |
NB. type s = (ix;ch;tb;nb;nt;na;wb) | |
NB. ix = current index into the input | |
NB. ch = current character, or '' after ix>#S | |
NB. tb = token buffer (grows as we match each char) | |
NB. nt = node tag | |
NB. na = node attributes | |
NB. nb = node buffer (grows as we build rules) | |
NB. wk = work stack (grows with recursive descent) | |
NB. ib = input buffer | |
NB. type fs = (bit;<s). flag + parse state | |
NB. the flag indicates whether the rule that | |
NB. was just invoked matched the input or not. | |
NB. accessor methods (combine with the names above: 'ix at state') | |
at =: >@{ NB. fetch and unbox at index x from boxes y | |
aa =: {{ (u&.(>@]) n{ y) n} y }} NB. apply u at ix n in array of boxes y | |
S =: ib&at NB. retrieve the input string | |
NB. nx :: state->state = move to next character (ch-:'' if past end) | |
nx =: {{'nx'] (i; <i{ ::'' S y) 0 1 } ,&(ch at y) aa tb y [ i=: 1 + ix at y }} | |
NB. match: string -> s (initial parser state) | |
NB. everything is stored explicitly inside | |
NB. the state tuple, to make it easy to backtrack. | |
match =: {{ (<y) ib } (<{.y) ch} s0 }} | |
NB. u parse: string -> parse tree | error | |
NB. applies rule u to (match y) and returns node buffer on success. | |
parse =: {{ if.f['f s'=.u match y do. >nb{s else. ,.'parse failed';<s end. }} | |
NB. u lex: string -> tokens | error | |
NB. applies rule u to (match y) and returns token buffer on success. | |
lex =: {{ if.f['f s'=.u match y do. >tb{s else. ,.'parse failed';<s end. }} | |
NB. parser combinators | |
NB. -------------------------------------------------- | |
NB. these are all adverbs (or conjunctions in the case of 'sep') | |
NB. that take an argument describing what to match and produce | |
NB. a verb from s->fs | |
NB. nil: s->fs. always match, but consume nothing. | |
nil =: 1;< | |
NB. any: s->fs. matches one input item, unless out of bounds. | |
any =: {{'any'] f;<nx^:f y [ f =. (#S y)>ix at y }} | |
NB. u neg: s->fs. consume nothing. | |
NB. This primitive allows you to implement lookahead. | |
neg =: {{'neg'] (-.f);< y [ 'f s' =. u y }} | |
NB. u end: s->fs. matches at end of input. | |
end =: any neg | |
NB. r try : s->fs. generic error trap. mostly this handles | |
NB. the case where we're reading past the end of the input. | |
try =: :: (0;<) | |
NB. m chr: s->fs. match literal atom m and advance the index | |
chr =: {{'chr'] p;<nx^:p y [ p =. m -: ch at y }} try | |
NB. m one: s->fs. match one item from m and advance the index. | |
one =: {{'one'] p;<nx^:p y [ p =. m e.~ ch at y }} try | |
NB. m seq: s->fs. match each rule in sequence m | |
seq =: {{'seq'] s=:y | |
for_r. m do. | |
if. -.f['f s'=. r`:6 s do. 0;<y return. end. | |
end. 1;<s }} | |
NB. m alt: s->fs. try each rule in m until one matches. | |
NB. This is "Prioritized Choice" from PEG parsers. | |
NB. It removes some ambiguity, but means you have to think | |
NB. carefully about how to order your rules. For example, | |
NB. if your language allows simple strings of letters to be | |
NB. used as names but also reserves some strings of letters | |
NB. as keywords, then you must specify the keywords first. | |
alt =: {{'alt'] s=:y | |
for_r. m do. | |
if. f['f s'=. r`:6 s do. 1;<s return. end. | |
end. 0;<y }} | |
NB. m lit: s->fs like seq for literals only. | |
NB. this just matches the whole sequence directly vs S. | |
NB. ,m is so we can match a single character. | |
lit =: {{'lit'] f;<nx^:(f*#m) y [ f=.m-:(S y){~(ix at y)+i.#m=.,m }} try | |
NB. eat =: {{ if. f['f s'=.u y do. s=. a: tb} ,&(tb{s) aa nb s end. f;<s }} | |
NB. zap =: {{ t =. tb{y if. f['f s'=.u y do. s=. t tb} s end. f;<s }} | |
NB. u ifu v: s->fs. if u matches, return 1;<(s_old) v (s_new) | |
ifu =: {{'ifu' if.f['f s'=.u y do. s=.y v s end. f;<s }} | |
NB. u tok: s->fs move current token to nb if u matches, else fail | |
tok =: ifu {{x] a: tb} ,&(tb{y) aa nb y }} | |
tok =: ifu {{'tok']x] a: tb} (tb at y) emit y }} | |
NB. m sym: s->fs alias for 'm lit tok' | |
sym =: lit tok | |
NB. u zap: s->fs match if u matches, but drop any generated nodes | |
NB. the only effect that persists is the current char and index. | |
zap =: ifu {{'zap'] ((ch,ix){y) (ch,ix) } x }} | |
NB. u opt: s->fs. optionally match rule u. succeed either way | |
opt =: {{ 1 ; }. u y }} | |
opt =: `nil alt | |
NB. u rep: s->fs. match 1+ repetitions of u | |
rep =: {{'rep'] f=.0 [ s=.y while. fi['fi s'=.u s do. f=.1 end. f;<s }} | |
NB. u orp: s->fs. optionally repeat (match 0+ repetitions of u)}} | |
orp =: rep opt | |
NB. u not: s->fs. match anything but u. | |
NB. fail if u matches or end of input, otherwise consume 1 input. | |
not =:{{'not' | |
if. (#S y) <: ix at y do. 0;<y | |
elseif. f['f s'=.{. u y do. 0;<y | |
else. 1;<nx y end. }} | |
not =: {{('not'] (u neg)`any) seq }} | |
NB. u sep v: s->fs. match 1 or more u, separated by v | |
sep =: {{ (u f.)`((v f.)`(u f.) seq orp) seq }} | |
NB. plain functions for tree building | |
NB. --------------------------------- | |
NB. ntup: the state indices to copy (in order) for current node | |
ntup =: nt,na,nb | |
NB. x node: s->s. starts a new node in the parse tree with tag x | |
NB. copies current node tuple to work stack | |
node =: {{ (<x) nt } a: ntup } ,&(<ntup{y) aa wk y }} | |
NB. x emit: s->s push item x into the current node buffer | |
emit =: {{ ,&(<x) aa nb y }} | |
NB. m attr n: s->fs. append (m=key;n=value) pair to the attribute dictionary. | |
NB. initialize dict if needed | |
attr =: {{ if. a:-:na{s do. s=. (<0 2$a:) na} s end. ,&(m;n) aa na y }} | |
NB. done: s->s. closes current node and makes it an item of previous node-in progress. | |
done =: {{ | |
new =. ntup { y NB. temp storage for the node we're closing. | |
'old s' =. wk tk y NB. pop the previous context | |
s =. (>old) ntup } s NB. insert it into the state | |
new emit s }} NB. and append new node to the node buffer. | |
NB. x tk: s->(item;<s). pop the last item from buffer x in state y. | |
tk =: {{ item ; < }: aa x y [ item =. {:>x{y }} | |
NB. combinators for tree building. | |
NB. ------------------------------ | |
NB. u elm n : s->fs. create node element tagged with n if u matches | |
elm =: {{'elm' if.f['f s'=.u n node y do. 1;<done s else. 0;<y end. }} | |
NB. u atr n : s->fs. if u matched, move last item to node attribute n. | |
atr =: {{'atr' if.f['f s'=. u y do. 1;<n attr it s [ 'it s'=. nb tk s else. 0;<y end. }} | |
NB. u tag: s->fs. move the last token in node buffer to be the node's tag. | |
NB. helpful for rewriting infix notation, eg (a head(+) b) -> (+ (a b)) | |
tag =: {{'tag' if.f['f s'=. u y do. 1;<tok nt } s['tok s' =. nb take y else. 0;<y end. }} | |
NB. character sets | |
alpha =: a.{~ , (i.26) +/ a.i.'Aa' | |
digit =: '0123456789' | |
hexit =: digit,'AaBbCcDdEeFf' | |
other =: (32}.127{.a.)-.alpha,digit | |
paren =: '()' | |
brack =: '[]' | |
curly =: '{}' | |
space =: 32{.a. NB. that's all ascii ctrl chars. | |
NB. Some predefined tokens | |
NL =: (CR lit opt)`(LF lit) seq | |
ALPHA =: alpha one | |
UNDER =: '_' lit | |
DIGIT =: digit one | |
NUMBER =: digit rep | |
IDENT =: (ALPHA`(ALPHA`DIGIT`UNDER alt orp) seq) | |
HEXIT =: hexit one | |
LPAREN =: '(' lit | |
RPAREN =: ')' lit | |
LBRACK =: '[' lit | |
RBRACK =: ']' lit | |
LCURLY =: '{' lit | |
RCURLY =: '}' lit | |
WS =: (TAB,' ') one | |
NB. generic line splitter | |
lines =: {{ ,.> nb at s [ 'f s' =. (NL not rep) tok sep (NL zap) match y }} | |
NB. j syntax rules | |
NB. ------------------------------- | |
NB. fragments used by the tokens: | |
j_op =: (brack,curly,other-.'_') one | |
j_num =: ('_' lit opt)`(DIGIT rep)`(('.'lit)`(DIGIT orp) seq opt) seq | |
squo =: '''' | |
squl =: squo lit | |
j_esc =: (2#squo) lit | |
NB. full tokens | |
J_LDEF =: '{{'lit | |
J_RDEF =: '}}'lit | |
J_NB =: (('NB','.')lit)`(NL not rep) seq | |
J_STR =: squl`(j_esc`(squl not) alt orp)`squl seq | |
J_OPER =: (j_op`DIGIT`ALPHA alt)`('.:' one rep) seq | |
J_OP =: j_op | |
J_NUMS =: j_num`('j'lit`j_num seq opt)seq sep (WS rep) | |
J_TOKEN =: NL`J_LDEF`J_RDEF`LPAREN`RPAREN`J_NB`J_STR`J_OPER`J_NUMS`J_OP`IDENT alt | |
J_LEXER =: (WS zap)`(J_TOKEN tok) alt rep | |
J_STR match h=.'''hello'',abc' | |
jsrc =: 0 : 0 | |
avg =: +/ % # NB. average is sum div len | |
avg +/\>:i.10 NB. average of first 10 triangle nums (22) | |
name =. 'Sally O''Malley' | |
) | |
[ expect =: (;:jsrc) | |
[ actual =: J_LEXER parse jsrc | |
assert expect -: actual | |
examples =: {{ | |
nx^:(<#S) s0 =: match S=.'hello (world 123)' | |
'a'lit rep match 'aaa' | |
ab =: ('a'lit)`('b'lit) | |
ab seq match 'abcat' | |
'c'lit opt match 'cat' NB. 1 | |
'c'lit opt match 'bat' NB. 1 | |
'c'lit match 'bat' NB. 0 | |
'a'lit rep match 'aaaahello' | |
'a' lit sep (','lit) match 'a,a,a.b' | |
('hello'lit)`(', 'lit zap)`('world'lit) seq match 'hello, world' | |
(NL not) rep match 'hello' | |
hi =: ('hello'sym)`(', 'lit zap)`('world'sym atr 'who') seq | |
[ 'f s' =: hi match 'hello, world' | |
'f s' =: hi elm 'hi' match 'hello, world' | |
}} | |
examples'' NB. no output, but will complain if anything broke | |
nil`any`end `:0@ match each '';'x' | |
s0 | |
'n'node s0 | |
'e' emit 'n'node s0 | |
'n2' node 'e' emit 'n'node s0 | |
'e2' emit 'n2' node 'e' emit 'n'node s0 | |
done 'e2' emit 'n2' node 'e' emit 'n'node s0 | |
'e3' emit done 'e2' emit 'n2' node 'e' emit 'n'node s0 | |
done 'e3' emit done 'e2' emit 'n2' node 'e' emit 'n'node s0 | |
NB. simple pascal -like block | |
BEGIN =: 'begin' sym | |
END =: 'end' sym | |
nends =: END not orp tok | |
block =: (BEGIN`nends`END) seq | |
block match 'begin hello; 2+2; end' | |
TRACE =: 0 | |
NB. m trace v: s->st. provides a trace of parse rule v if TRACE~:0 | |
trace =: {{ | |
if. TRACE do. r [ smoutput m; r=.v Y=:y [ smoutput '>> ',m | |
else. v y end. }} | |
NB. decompiler | |
ar =: 5!:1@< | |
br =: 5!:2@< | |
tr =: 5!:4@< | |
ops =: ;:'nil any lit one seq alt tok sym zap opt rep orp not sep elm atr tag run' | |
all =: ops,;:'try ifu' | |
ALL =: toupper each all | |
(ALL) =: br each all | |
NB. s-expressions (lisp-like syntax) | |
lp =: 'lp' trace (LPAREN zap) | |
rp =: 'rp' trace (RPAREN zap) | |
id =: 'id' trace (lp`rp`WS alt not rep tok) | |
NB. se: s-expression parser | |
se =: 'se' trace (lp`(id`se`(WS zap) alt orp)`rp seq elm '' sep (WS zap)) | |
NB. ll: lisp lexer | |
ll =: (LPAREN tok)`(RPAREN tok)`(WS zap)`id alt rep | |
lisp =: '(one two (a b c) three) (banana)' | |
ll parse lisp | |
se parse lisp | |
NB. tree matching | |
NB. u all: s->fs. matchs if u matches the entire remaining input. | |
all =: {{ (u f.)`end seq }} | |
NB. u box: s->fs. matches if current value is | |
box =: {{ | |
if. 32 = 3!:0 c =. ch at y | |
do. smoutput 'entering box C:' [ C =: > c | |
smoutput c | |
f;<nx^:f s [ 'f s'=.u all match > c else. 0;<y end. }} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment