Created
March 14, 2016 02:55
-
-
Save atg/7f00ea563b56789724ce to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
_ = require 'lodash' | |
{Tokenizer} = require './tokenize' | |
last = _.last | |
# There's two varieties of data structures: 'vals' and 'val' | |
# 'vals' is a branch node that contains many direct children | |
# 'val' is a leaf node that has no direct children | |
makeVals = (kind, vals) -> { kind: kind, vals: vals } | |
makeLeftsRights = (kind, lefts, rights) -> { kind: kind, lefts: lefts, rights: rights } | |
makeVal = (kind, val) -> { kind: kind, val: val } | |
isVal = (obj, kind, val) -> obj.kind == kind and _.isEqual(obj.val, val) | |
isOfKind = (obj, kind) -> obj.kind == kind | |
# ------------------------ | |
# --- Utility functions -- | |
makeRoot = (vals=[]) -> { kind: '_root', vals: vals } | |
isPunct = (tok) -> tok.kind == 'punct' | |
isIdent = (tok) -> tok.kind == 'ident' | |
isString = (tok) -> tok.kind == 'string' | |
isNumber = (tok) -> tok.kind == 'number' | |
isEOL = (tok) -> tok.kind == 'eol' | |
replaceArrayInto = (xs, ys) -> | |
if xs == ys | |
return | |
n = ys.length | |
xs.length = n | |
i = 0 | |
while i < n | |
xs[i] = ys[i] | |
i += 1 | |
printBare = process.stdout.write | |
printJSON = (j) -> | |
console.log(JSON.stringify(j, null, 2)) | |
arrayKeys = ['lefts', 'rights', 'vals', 'table'] | |
printTree = (j, indent=0, shouldIndentFirstLine=true) -> | |
sp = _.repeat(' ', indent * 2) | |
sp0 = if shouldIndentFirstLine then sp else '' | |
isObject = _.isObject(j) | |
if isObject | |
if 'vals' of j | |
process.stdout.write(sp0+"#{j.kind}") | |
printTree(j.vals, indent, false) | |
return | |
if 'table' of j | |
process.stdout.write(sp0+"#{j.kind}") | |
printTree(j.table, indent, false) | |
return | |
if 'lefts' of j and 'rights' of j | |
console.log(sp0+"#{j.kind}(") | |
process.stdout.write(sp+" lefts: ") | |
printTree(j.lefts, indent + 1, false) | |
process.stdout.write(sp+" rights: ") | |
printTree(j.rights, indent + 1, false) | |
console.log(sp+")") | |
return | |
if 'left' of j and 'right' of j | |
console.log(sp0+"#{j.kind}(") | |
printTree(j.left, indent + 1) | |
printTree(j.right, indent + 1) | |
console.log(sp+")") | |
return | |
if _.isArray(j) | |
if j.length | |
console.log(sp0+"[") | |
for val in j | |
printTree(val, indent + 1) | |
console.log(sp+"]") | |
else | |
console.log(sp0+"[]") | |
return | |
# console.log(JSON.stringify(j)) | |
console.log(sp+"#{j.kind}(#{JSON.stringify(j.val)})") | |
# ------------------------- | |
# --- Parsing functions --- | |
Parsing = {} | |
# Splits a token stream into a tree of brackets | |
Parsing._brackets = (toks) -> | |
stack = [makeRoot()] | |
output = [] | |
bracketKindsMap = { | |
'(': 'group', ')': 'group' | |
'[': 'square', ']': 'square' | |
'{': 'block', '}': 'block' | |
} | |
for tok in toks | |
if isPunct(tok) and tok.val in ['(', '[', '{'] | |
bracketKind = bracketKindsMap[tok.val] | |
branch = { 'kind': bracketKind, 'vals': [] } | |
last(stack).vals.push(branch) | |
stack.push(branch) | |
else if isPunct(tok) and tok.val in [')', ']', '}'] and last(stack).kind == bracketKindsMap[tok.val] | |
stack.pop() | |
else | |
last(stack).vals.push(tok) | |
return stack[0].vals | |
# Splits a token stream by a given token | |
Parsing._splitBy = (toks, outerKind, innerKind, splitter) -> | |
group = makeVals(innerKind, []) | |
groups = [] | |
hasFound = false | |
for x in toks | |
if _.isEqual(x, splitter) | |
hasFound = true | |
groups.push(group) if group.vals.length > 0 | |
group = makeVals(innerKind, []) | |
else | |
group.vals.push(x) | |
groups.push(group) if group.vals.length > 0 | |
if hasFound | |
if outerKind? | |
return [makeVals(outerKind, groups)] | |
else | |
return groups | |
else | |
return toks | |
# Splits a token stream by a given token | |
Parsing._splitByIntoTable = (toks, kind, splitter) -> | |
group = [] | |
groups = [] | |
hasFound = false | |
for x in toks | |
if _.isEqual(x, splitter) | |
hasFound = true | |
groups.push(group) if group.length > 0 | |
group = [] | |
else | |
group.push(x) | |
groups.push(group) if group.length > 0 | |
if hasFound | |
return [{ kind: kind, table: groups }] | |
else | |
return toks | |
# Splits a token stream into lines | |
Parsing._splitLines = (toks) -> | |
group = makeVals('line', []) | |
groups = [] | |
foundNewline = false | |
for x in toks | |
if x.kind == 'newline' | |
foundNewline = true | |
hasFound = true | |
groups.push(group) if group.vals.length > 0 | |
group = makeVals('line', []) | |
else | |
group.vals.push(x) | |
groups.push(group) if group.vals.length > 0 | |
if foundNewline | |
return groups | |
# If there are any non-lines then wrap in a line | |
for tok in toks | |
if _.isObject(tok) and tok.kind != 'line' | |
return [makeVals('line', toks)] | |
# Otherwise return the lines | |
return toks | |
# Partitions an array of tokens by one token into left and right sides | |
lpartition = (toks, kind, splitter) -> | |
for tok, i in toks | |
if _.isEqual(tok, splitter) | |
lefts = _.slice(toks, 0, i) | |
rights = _.slice(toks, i + 1) | |
return makeLeftsRights(kind, lefts, rights) | |
return null | |
Parsing._lpartitionBy = (toks, kind, splitter) -> | |
for tok, i in toks | |
if _.isEqual(tok, splitter) | |
lefts = _.slice(toks, 0, i) | |
rights = _.slice(toks, i + 1) | |
return [ makeLeftsRights(kind, lefts, rights) ] | |
return toks | |
Parsing._rpartitionBy = (toks, kind, splitter) -> | |
for tok, i in toks by -1 | |
if _.isEqual(tok, splitter) | |
lefts = _.slice(toks, 0, i) | |
rights = _.slice(toks, i + 1) | |
return [ makeLeftsRights(kind, lefts, rights) ] | |
return toks | |
# ------------------------- | |
# --- Walking functions --- | |
## TODO: I think walk should not only walk through vals, but also 'left' and 'right'. Then we don't have to have trees that are full of ... | |
## TODO: also I think there should be some support for a .table property that is a 2D array | |
# { kind, val: Any } -- leaf node | |
# { kind, vals: [Any] } -- branch node | |
# { kind, left: Any, right: Any } -- either node | |
# { kind: lefts: [Any], rights: [Any] } | |
# { kind, table: [[Any]] } -- either node | |
# TBH if we're going to do this then _walk should be able to handle arrays | |
# Walk through the nodes of a tree. | |
# before(x, isLeaf) walks outside-inwards | |
# after(x, isLeaf walks inside-outwards | |
# ... think the order of <begin> tags in XML, vs the order of </end> tags | |
# this function is a total mess! | |
# need to simply and generalise! perhaps keep a stack of parents | |
# and turn before/after into functions that RETURN the new array instead of trying to set it | |
runNotifs = (obj, stack, callback) -> | |
if not callback? | |
return | |
for ak in arrayKeys | |
vals = obj[ak] | |
if vals? | |
result = callback(stack, vals) | |
if result? | |
obj[ak] = result | |
Parsing._walk = (obj, stack, before, after, leaf=null) -> | |
if _.isArray(obj) | |
arr = obj | |
if before | |
result = before(stack, arr) | |
if result? | |
replaceArrayInto(arr, before(stack, arr)) | |
for subobj in arr | |
Parsing._walk(subobj, stack, before, after, leaf) | |
if after | |
result = after(stack, arr) | |
if result? | |
replaceArrayInto(arr, result) | |
return | |
# Is this a leaf node (it contains no arrayKeys) | |
isLeaf = true | |
for ak in arrayKeys | |
if obj[ak]? | |
isLeaf = false | |
if isLeaf | |
if before or after | |
stack.push(obj) | |
leaf(stack, obj) if leaf | |
stack.pop(obj) | |
return | |
stack.push(obj) | |
runNotifs(obj, stack, before) | |
for ak in arrayKeys | |
vals = obj[ak] | |
continue if not vals? | |
for x in vals | |
Parsing._walk(x, stack, before, after, leaf) | |
runNotifs(obj, stack, after) | |
stack.pop() | |
Parsing._mapTreeInPlace = (tree, mapping) -> | |
Parsing._walk tree, [], null, (stack, vals) -> | |
return mapping(vals, last(stack)) | |
### | |
Parsing._walkOld = (tree, before, after) -> | |
walkInner = (x) -> | |
# if x is an array, handle it specifically | |
if _.isArray(x) | |
arr = x | |
before(null, arr, 'table', true) if before | |
for cell in arr | |
Parsing._walk(cell, before, after) | |
after(null, arr, 'table', true) if after | |
# if x has a table property, it will contain an array of arrays | |
else if x.table? | |
if before | |
for row in x.table | |
before(x, row, 'table', true) | |
for row in x.table | |
for cell in row | |
Parsing._walk(cell, before, after) | |
if after | |
for row in x.table | |
after(x, row, 'table', true) | |
# if x is a branch node, then it may contain .vals, .lefts, .rights, or a mixture of the three | |
else if x.vals? or x.lefts? or x.rights? | |
if before | |
before(x, x.vals, 'vals') if x.vals? | |
before(x, x.lefts, 'lefts') if x.lefts? | |
before(x, x.rights, 'rights') if x.rights? | |
Parsing._walk(x, before, after) | |
if after | |
after(x, x.vals, 'vals') if x.vals? | |
after(x, x.lefts, 'lefts') if x.lefts? | |
after(x, x.rights, 'rights') if x.rights? | |
# otherwise x is a leaf node | |
else | |
before(x, null, null) if before | |
after(x, null, null) if after | |
before(tree, tree.vals, 'vals') if before and tree.vals? | |
before(tree, tree.lefts, 'lefts') if before and tree.lefts? | |
before(tree, tree.rights, 'rights') if before and tree.rights? | |
if tree.vals? | |
for x in tree.vals | |
walkInner(x) | |
if tree.lefts? | |
for x in tree.lefts | |
walkInner(x) | |
if tree.rights? | |
for x in tree.rights | |
walkInner(x) | |
after(tree, tree.vals, 'vals') if after and tree.vals? | |
after(tree, tree.lefts, 'lefts') if after and tree.lefts? | |
after(tree, tree.rights, 'rights') if after and tree.rights? | |
### | |
# -------------------- | |
# --- Tokenization --- | |
Parsing.tokenize = (txt) -> | |
return new Tokenizer(txt).tokenize() | |
# -------------------- | |
# --- Test it out! --- | |
# txt = 'foo = bar = baz = 1, 2; 3, 4' | |
txt = '''foo { | |
( | |
abc; abc | |
) | |
} | |
''' | |
# txt = #'a * b + c * d + e' | |
txt = ''' | |
for x, y in y { | |
print(foo) | |
} | |
''' | |
#outer( inner(a, b) (a, b) ) (a, b) | |
toks = Parsing.tokenize(txt) | |
toks = Parsing._brackets(toks) | |
root = makeVals('block', toks) | |
# (lowest precedence) | |
# - newline stage: parse out newlines from inside {blocks} | |
Parsing._mapTreeInPlace root, (toks, element) -> | |
if element.kind == 'block' | |
# If we are in a {...} block, then each val must be a line | |
Parsing._splitLines(toks) | |
else | |
# Remove extraneous newlines from the output | |
_.filter(toks, (x) -> (x.kind != 'newline')) | |
# - predictive stage: prefixes on the starts of lines | |
predictiveKeywords = ["let", "var", "for", "while"] | |
predictiveParslets = {} | |
predictiveParslets.for = (toks) -> | |
# e.g. for _ in _ {block} | |
# splits on the in | |
[firsts..., end] = toks | |
firsts = toks # todo: we shove the block at the end of 'rights' for now | |
console.log(firsts, end) | |
leftsrights = lpartition(firsts, 'for', makeVal('ident', 'in')) | |
# leftsrights.block = end --- todo: implement the .block property | |
Parsing._mapTreeInPlace root, (toks, element) -> | |
if element.kind == 'line' and toks.length | |
tok = toks[0] | |
for kw in predictiveKeywords | |
if isVal(tok, 'ident', kw) | |
rest = _.tail(toks) | |
obj = makeVals(kw, rest) | |
if kw of predictiveParslets | |
obj = predictiveParslets[kw](rest) | |
return [obj] | |
return toks | |
# - semicolons and commas | |
Parsing._mapTreeInPlace root, (toks) -> Parsing._splitByIntoTable(toks, 'bindings', makeVal('punct', '=')) | |
Parsing._mapTreeInPlace root, (toks) -> Parsing._splitByIntoTable(toks, 'semicolons', makeVal('punct', ';')) | |
Parsing._mapTreeInPlace root, (toks) -> Parsing._splitByIntoTable(toks, 'commas', makeVal('punct', ',')) | |
# while true | |
# before = _.cloneDeep(root) | |
# | |
# # - colons make left associative 'pairs' | |
# Parsing._mapTreeInPlace root, (toks) -> Parsing._lpartitionBy(toks, 'pair', makeVal('punct', ':')) | |
# | |
# # now would be good to do operator parsing | |
# # and actually the below does not work. | |
# # Parsing._mapTreeInPlace root, (toks) -> Parsing._lpartitionBy(toks, 'add', makeVal('punct', '+')) | |
# # Parsing._mapTreeInPlace root, (toks) -> Parsing._lpartitionBy(toks, 'sub', makeVal('punct', '-')) | |
# | |
# # Parsing._mapTreeInPlace root, (toks) -> Parsing._lpartitionBy(toks, 'mul', makeVal('punct', '*')) | |
# # Parsing._mapTreeInPlace root, (toks) -> Parsing._lpartitionBy(toks, 'div', makeVal('punct', '/')) | |
# | |
# # TODO: need to recursively apply stuff like pairs. best to use a while loop | |
# break if _.isEqual(root, before) | |
Specific = {} | |
Specific.call = (toks, parent) -> | |
output = [] | |
for tok in toks | |
if tok.kind == 'group' and output.length | |
# whatever is immediately before gets slurped up into a call | |
prev = output.pop() | |
output.push { | |
kind: 'call' | |
# vals: [prev, makeVal('spacer', ' '), tok] | |
left: prev | |
right: tok | |
} | |
else | |
output.push(tok) | |
return output | |
Parsing._mapTreeInPlace root, (toks, parent) -> Specific.call(toks, parent) | |
# Parsing._mapTreeInPlace root, (toks, parent) -> Specific.call(toks, parent) | |
# f = (stack, vals) -> | |
# # console.log("@INKI", JSON.stringify(stack, null, 2)) | |
# return vals | |
# g = (stack, vals) -> | |
# # console.log("@OUTKI", JSON.stringify(stack, null, 2)) | |
# return vals | |
# Parsing._walk root, [], f, g, null | |
### | |
# call (left high) -- <any> <group> | |
# subscript (left high) -- <any> <square> | |
# for (left predictive) -- line^ for <^in> in ... <block> $line | |
# while (left predictive) -- line^ while ... <block> $line | |
# return | |
# ~ Literals ~ | |
# array: square of exprs | |
# object: block | |
### | |
printTree(root) | |
printJSON(root) | |
splitByAs = (root, outerKind, innerKind, splitter) -> | |
Parsing._mapTreeInPlace root, (toks) -> | |
Parsing._splitBy(toks, outerKind, innerKind, splitter) | |
return | |
txt = '''def foo(x, y) { | |
1, 2, 3 | |
} | |
''' | |
# printJSON(root) | |
## todo: partitioning and infix operators, and context sensitive parsing | |
# e.g. A -> B (right assoc) | |
# need infix and suffix operator parsing | |
# need to parse stuff like `* group` (function call) and `* square` (subscripting) | |
# splitByAs(root, 'bindings', 'binding', makeVal('punct', '=')) | |
# splitByAs(root, 'semicolons', 'semicolon', makeVal('punct', ';')) | |
# splitByAs(root, 'commas', 'comma', makeVal('punct', ',')) | |
# (high precedence) | |
# splitByAs(root, 'lines', makeVal('newline', '\n')) | |
# High precedence ^^^ | |
# printJSON(root) | |
# XS = [1, 2, 3] | |
# YS = [10, 20, 30, 40, 50] | |
# | |
# replaceArrayInto(XS, YS) | |
# console.log XS |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
masterful, but i invade invitation. well done