Last active
January 10, 2021 16:41
-
-
Save jaens/3b27ac1907cf0149ac06172760e5a697 to your computer and use it in GitHub Desktop.
Quick example of an LL(1) incremental parser in TypeScript
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
type RuleName = string; | |
type TokenName = string; | |
type Ast = { type: string; args: Ast[] }; | |
// Stack frame for executing grammar rules | |
type Frame = { | |
/** Current rule being executed */ | |
rule: RuleName; | |
/** Position within instruction array */ | |
ipos: number; | |
/** AST node results from any other rules that have been called */ | |
astList: Ast[]; | |
/** The final AST result of this node (for use in next run) */ | |
result: Ast | null; | |
/** Nodes for the currently repeating production */ | |
repeatList: Ast[] | null; | |
}; | |
// TODO: Add an "optional" and "alternative" instruction | |
type BasicInstruction = ["repeat", RuleName]; | |
/** The grammar uses short string forms for token & rule instructions which are "simplified" before execution */ | |
type MacroInstruction = BasicInstruction | TokenName | RuleName; | |
type SimpleInstruction = | |
| BasicInstruction | |
| ["token", TokenName] | |
| ["call", RuleName] | |
| ["return"]; | |
// The grammar. | |
// Non-terminals are called "rules". Rules are basically equivalent to procedures. | |
type Rules = { [name: string]: MacroInstruction[] }; | |
type State = { | |
lexPos: number; | |
frames: Frame[]; | |
}; | |
/** Map of input_position -> state (stored as a sparse array) */ | |
type StateCache = State[]; | |
/** Represents a change to the parser's input (currently mostly unused) */ | |
type Diff = { | |
lastUsedCachePos: number; | |
}; | |
type FirstSets = { [name: string]: TokenName[] }; | |
type Parser = { | |
rules: Rules; | |
firsts: FirstSets; | |
state: State; | |
input: string; | |
cache: StateCache; | |
result?: Ast; | |
diff?: Diff; | |
}; | |
// | |
// Utilities | |
// | |
function last<T>(arr: T[]) { | |
return arr[arr.length - 1]; | |
} | |
function error(msg: string): never { | |
throw new Error(msg); | |
} | |
function logObj(obj: any, depth: number = null) { | |
console.dir(obj, { depth }); | |
} | |
/** Find the common prefix/suffix length of two strings */ | |
function commonLength(a: string, b: string, suffix: boolean) { | |
function c(s: string, pos: number) { | |
return s.charCodeAt(suffix ? s.length - 1 - pos : pos); | |
} | |
let pos = 0; | |
while (pos < a.length && pos < b.length && c(a, pos) === c(b, pos)) pos++; | |
return pos; | |
} | |
/** Check if two arrays are equal using custom equality function `eq` */ | |
function arraysEqual<T>( | |
a: T[] | null, | |
b: T[] | null, | |
eq: (a: T, b: T) => boolean | |
) { | |
return ( | |
a === b || | |
(a != null && | |
b != null && | |
a.length === b.length && | |
a.every((n, idx) => eq(n, b[idx]))) | |
); | |
} | |
// | |
// First sets for LL(1) parsing (very basic and inefficient) | |
// | |
function buildFirst(rules: Rules, rule: RuleName): TokenName[] { | |
const insn = simplify(rules[rule][0]); | |
switch (insn[0]) { | |
case "token": | |
return [insn[1]]; | |
case "repeat": | |
case "call": | |
return buildFirst(rules, insn[1]); | |
case "return": | |
return []; | |
} | |
} | |
function buildFirstSets(rules: Rules) { | |
const firsts: FirstSets = {}; | |
for (const rule in rules) { | |
firsts[rule] = buildFirst(rules, rule); | |
} | |
return firsts; | |
} | |
// | |
// State management | |
// | |
function initialState(): State { | |
return { | |
lexPos: 0, | |
frames: [makeCallFrame("program")], | |
}; | |
} | |
function copyState(s: State): State { | |
// This should copy anything `parse()` can mutate (so states can be persistently stored in the cache) | |
function copyFrame(f: Frame): Frame { | |
return { | |
...f, | |
astList: [...f.astList], | |
repeatList: f.repeatList && [...f.repeatList], | |
}; | |
} | |
return { | |
lexPos: s.lexPos, | |
frames: s.frames.map(copyFrame), | |
}; | |
} | |
function framesMatch(fa: Frame, fb: Frame) { | |
return ( | |
fa.ipos == fb.ipos && | |
fa.rule === fb.rule && | |
arraysEqual(fa.repeatList, fb.repeatList, astMatch) | |
); | |
} | |
function astMatch(a: Ast, b: Ast): boolean { | |
// Since AST nodes are immutable within a single parse pass, we can also try an identity comparison first | |
return ( | |
a === b || (a.type === b.type && arraysEqual(a.args, b.args, astMatch)) | |
); | |
} | |
function parentAstMatch(fa: Frame[], fb: Frame[]) { | |
// Do not compare the current frame since it will be "fixed up" | |
const currentFrameIdx = fa.length - 1; | |
return ( | |
fa.length === fb.length && | |
fa.every((f, idx) => | |
idx === currentFrameIdx | |
? true | |
: arraysEqual(f.astList, fb[idx].astList, astMatch) | |
) | |
); | |
} | |
/** Check if two parsing states match ie. if after a possible "fix up", | |
* proceeding from these two states is guaranteed to have an identical result. */ | |
function statesMatch(sa: State, sb: State) { | |
return ( | |
arraysEqual(sa.frames, sb.frames, framesMatch) && | |
parentAstMatch(sa.frames, sb.frames) | |
); | |
} | |
// | |
// Instructions | |
// | |
// NOTE: The input is a string with each character representing one token (real parsers would have a lexer) | |
function isToken(s: string) { | |
return s.length === 1; | |
} | |
function simplify(insn: MacroInstruction): SimpleInstruction { | |
if (typeof insn === "string") { | |
if (isToken(insn)) { | |
return ["token", insn]; | |
} else { | |
return ["call", insn]; | |
} | |
} else if (typeof insn === "undefined") { | |
// Treat reading past the end of the instructions array as a "return" instruction | |
return ["return"]; | |
} else if (Array.isArray(insn)) { | |
return insn; | |
} else { | |
error(`unimplemented instruction ${insn}`); | |
} | |
} | |
// | |
// Execution | |
// | |
function makeCallFrame(rule: RuleName): Frame { | |
return { | |
rule, | |
ipos: 0, | |
astList: [], | |
repeatList: null, | |
result: null, | |
}; | |
} | |
function shouldSync(rule: RuleName) { | |
return rule === "function"; | |
} | |
/** Execute one step in the parser state machine */
function parse(parser: Parser): boolean {
  const { rules, firsts, state, input, cache } = parser;
  // Push a new frame so the next step starts executing `rule` at its first instruction.
  function beginCall(rule: string) {
    state.frames.push(makeCallFrame(rule));
  }
  // Consume the current token if it matches `t`; otherwise abort the whole parse.
  function expect(t: string) {
    if (token === t) {
      state.lexPos++;
    } else {
      error(`expected ${insn}, got ${token}`);
    }
  }
  // The frame on top of the call stack (the rule currently executing).
  function topFrame() {
    return last(state.frames);
  }
  // Read current instruction
  const frame = topFrame();
  const { rule, ipos } = frame;
  // May read past the end of the rule's instructions; `simplify` turns that into "return".
  const insn = simplify(rules[rule][ipos]);
  // Read current token
  const { lexPos } = state;
  const token = input.charAt(lexPos);
  console.log("execute", { rule, insn, token });
  switch (insn[0]) {
    case "token":
      // Terminal: match it against the input and advance past it.
      expect(insn[1]);
      frame.ipos++;
      break;
    case "call":
      // Pre-increment so call will return to following instruction
      frame.ipos++;
      beginCall(insn[1]);
      break;
    case "return": {
      // Build resulting AST node for the current rule
      const returnValue = makeAst(rule, frame.astList, null);
      // Cache AST node for use in next incremental run
      frame.result = returnValue;
      if (shouldSync(rule)) {
        // Try to match up with previous state if we are incrementally parsing
        const { diff } = parser;
        if (diff) {
          const prevState = cache[lexPos];
          if (prevState) {
            console.log("Encountered previous state at", {
              lexPos,
            });
            printState(prevState);
            printState(state);
            if (statesMatch(prevState, state)) {
              console.log("States match, returning");
              // Mutate the result of this rule from the last run in-place to update it with the new nodes
              const prevResult = last(prevState.frames).result;
              makeAst(rule, frame.astList, prevResult);
              // Reuse the rest of the previous run's AST; signal completion.
              return true;
            } else {
              console.log("State mismatch, continuing");
            }
          }
          // Clear previously cached values (which are now invalid because the input has changed)
          console.log("Clearing old cache", {
            from: diff.lastUsedCachePos,
            to: lexPos,
          });
          cache.fill(null, diff.lastUsedCachePos, lexPos);
          diff.lastUsedCachePos = lexPos;
        }
        // Store current state into cache
        // (a deep copy, so later mutation of `state` cannot corrupt the snapshot)
        cache[lexPos] = copyState(state);
      }
      state.frames.pop();
      if (state.frames.length > 0) {
        // Append the resulting node of the current rule to the node list of the caller
        topFrame().astList.push(returnValue);
      } else {
        // Call stack empty: the start rule has returned, parsing is complete.
        console.log("done");
        parser.result = returnValue;
        return true;
      }
      break;
    }
    case "repeat": {
      if (frame.repeatList === null) {
        // First time executing "repeat"
        frame.repeatList = [];
      } else {
        // Add returned node from call to repeated node list
        frame.repeatList.push(frame.astList.pop());
      }
      // Check if the wanted rule will follow (based on the first set of that rule)
      const rule = insn[1];
      const first = firsts[rule];
      if (first.includes(token)) {
        beginCall(rule);
      } else {
        // We are done repeating, proceed
        frame.astList.push({ type: "list", args: frame.repeatList });
        frame.ipos++;
        frame.repeatList = null;
      }
      break;
    }
  }
  // No completion condition hit: caller should keep stepping.
  return false;
}
/** Build an AST node or update it, depending on whether `previous` is set. */ | |
function makeAst(rule: RuleName, args: Ast[], previous: Ast | null): Ast { | |
console.log("makeAst", { rule, args, previous }); | |
if (previous) { | |
previous.args = args; | |
} else { | |
return { type: rule, args }; | |
} | |
} | |
// | |
// Incremental parsing utilities | |
// | |
function findLastValidState(cache: StateCache, startPos: number) { | |
let res = null; | |
for (const pos in cache) { | |
if (+pos < startPos) { | |
res = cache[pos]; | |
} else { | |
break; | |
} | |
} | |
return res; | |
} | |
/**
 * Replace the parser's input with `input` and rewind the parser to the last
 * cached state that is still valid, shifting cached entries to account for
 * the inserted text. Only growing edits (insertions) are implemented.
 */
function changeInput(parser: Parser, input: string) {
  const prevInput = parser.input;
  // The unchanged leading/trailing regions of the input delimit the edit.
  const prefixLen = commonLength(input, prevInput, false);
  const suffixLen = commonLength(input, prevInput, true);
  const lenDelta = input.length - prevInput.length;
  // Resume from the last cached state before the edit, or from scratch.
  const continueState =
    findLastValidState(parser.cache, prefixLen) ?? initialState();
  logObj({ prefixLen, suffixLen, lenDelta, state: continueState }, 3);
  // Update the positions of the previously cached states
  if (lenDelta > 0) {
    // Shift previously cached values to the right
    parser.cache.length = input.length; // NOTE: This deletes the "EOF" cache element
    parser.cache.copyWithin(
      input.length - suffixLen,
      prevInput.length - suffixLen
    );
    // Clear cache for current state to prevent immediate cache match
    delete parser.cache[continueState.lexPos];
    // Clear cache for the part that changed
    for (let i = prefixLen; i < input.length - suffixLen; i++)
      delete parser.cache[i];
    printCache(parser.cache);
  } else {
    // Shrinking or equal-length edits would need leftward shifts / overlap
    // handling that is not implemented yet.
    error("deleting text unimplemented");
  }
  // Update parser state to continue from last valid position
  parser.diff = {
    lastUsedCachePos: continueState.lexPos,
  };
  parser.input = input;
  parser.state = continueState;
}
// | |
// Debugging | |
// | |
function printAst(a: Ast, level = 0) { | |
if (!a) { | |
error(`ast missing`); | |
} | |
let s = ""; | |
for (let i = 0; i < level - 1; i++) s += " "; | |
if (level > 0) { | |
s += "+-- "; | |
} | |
s += `${a.type}`; | |
console.log(s); | |
for (let arg of a.args) { | |
printAst(arg, level + 1); | |
} | |
} | |
function frameToString(f: Frame) { | |
const ip = `${f.rule} @${f.ipos}`; | |
let s = `${ip.padEnd(15)} - ${f.astList.length} nodes`; | |
if (f.repeatList !== null) { | |
s += ` repeating(${f.repeatList.length})`; | |
} | |
return s; | |
} | |
function printCache(c: StateCache) { | |
console.log(); | |
console.log("cache {"); | |
let i = 0; | |
for (;;) { | |
let gap = 0; | |
while (i < c.length && c[i] == null) { | |
gap++; | |
i++; | |
} | |
if (gap > 0) console.log(`<${gap} empty slots>`); | |
if (i >= c.length) break; | |
const s = c[i]; | |
console.log(`@${i}: lexPos=${s.lexPos}, frames:`); | |
for (const f of s.frames) { | |
console.log(` ${frameToString(f)}`); | |
} | |
i++; | |
} | |
console.log("} end cache"); | |
} | |
function printFrame(f: Frame) { | |
console.log(" " + frameToString(f)); | |
for (const a of f.astList) { | |
printAst(a, 2); | |
} | |
} | |
function printState(s: State) { | |
console.log(); | |
console.log(`State @ ${s.lexPos} {`); | |
for (const f of s.frames) { | |
printFrame(f); | |
console.log(); | |
} | |
console.log("} end state"); | |
} | |
// | |
// Main | |
// | |
// Grammar under test. Tokens are single characters; "n" stands in for any
// identifier token (there is no real lexer).
const rules: Rules = {
  // program := function*
  program: [["repeat", "function"]],
  // function := identifier "{" statement* "}"
  function: ["identifier", "{", ["repeat", "statement"], "}"],
  // statement := identifier "=" identifier ";"
  statement: ["identifier", "=", "identifier", ";"],
  // identifier := "n"
  identifier: ["n"],
};
function runParser(parser: Parser, maxSteps = 100) { | |
for (let i = 0; i < maxSteps; i++) { | |
const done = parse(parser); | |
if (done) return parser.result; | |
} | |
} | |
function main() { | |
const input = "n{}n{n=n;}n{}"; | |
const newInput = "n{}n{n=n;n=n;n=n;}n{}"; | |
const parser: Parser = { | |
rules, | |
input, | |
state: initialState(), | |
firsts: buildFirstSets(rules), | |
cache: [], | |
}; | |
console.log("firsts", parser.firsts); | |
const result = runParser(parser); | |
printAst(result); | |
{ | |
console.log("Running incrementally"); | |
printCache(parser.cache); | |
changeInput(parser, newInput); | |
const result = runParser(parser); | |
printAst(result); | |
} | |
} | |
main(); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment