Skip to content

Instantly share code, notes, and snippets.

@jaens
Last active January 10, 2021 16:41
Show Gist options
  • Save jaens/3b27ac1907cf0149ac06172760e5a697 to your computer and use it in GitHub Desktop.
Save jaens/3b27ac1907cf0149ac06172760e5a697 to your computer and use it in GitHub Desktop.
Quick example of an LL(1) incremental parser in TypeScript
type RuleName = string;
type TokenName = string;
type Ast = { type: string; args: Ast[] };
// Stack frame for executing grammar rules
type Frame = {
/** Current rule being executed */
rule: RuleName;
/** Position within instruction array */
ipos: number;
/** AST node results from any other rules that have been called */
astList: Ast[];
/** The final AST result of this node (for use in next run) */
result: Ast | null;
/** Nodes for the currently repeating production */
repeatList: Ast[] | null;
};
// TODO: Add an "optional" and "alternative" instruction
type BasicInstruction = ["repeat", RuleName];
/** The grammar uses short string forms for token & rule instructions which are "simplified" before execution */
type MacroInstruction = BasicInstruction | TokenName | RuleName;
type SimpleInstruction =
| BasicInstruction
| ["token", TokenName]
| ["call", RuleName]
| ["return"];
// The grammar.
// Non-terminals are called "rules". Rules are basically equivalent to procedures.
type Rules = { [name: string]: MacroInstruction[] };
type State = {
lexPos: number;
frames: Frame[];
};
/** Map of input_position -> state (stored as a sparse array) */
type StateCache = State[];
/** Represents a change to the parser's input (currently mostly unused) */
type Diff = {
lastUsedCachePos: number;
};
type FirstSets = { [name: string]: TokenName[] };
type Parser = {
rules: Rules;
firsts: FirstSets;
state: State;
input: string;
cache: StateCache;
result?: Ast;
diff?: Diff;
};
//
// Utilities
//
function last<T>(arr: T[]) {
return arr[arr.length - 1];
}
function error(msg: string): never {
throw new Error(msg);
}
function logObj(obj: any, depth: number = null) {
console.dir(obj, { depth });
}
/** Find the common prefix/suffix length of two strings */
function commonLength(a: string, b: string, suffix: boolean) {
function c(s: string, pos: number) {
return s.charCodeAt(suffix ? s.length - 1 - pos : pos);
}
let pos = 0;
while (pos < a.length && pos < b.length && c(a, pos) === c(b, pos)) pos++;
return pos;
}
/** Check if two arrays are equal using custom equality function `eq` */
function arraysEqual<T>(
a: T[] | null,
b: T[] | null,
eq: (a: T, b: T) => boolean
) {
return (
a === b ||
(a != null &&
b != null &&
a.length === b.length &&
a.every((n, idx) => eq(n, b[idx])))
);
}
//
// First sets for LL(1) parsing (very basic and inefficient)
//
function buildFirst(rules: Rules, rule: RuleName): TokenName[] {
const insn = simplify(rules[rule][0]);
switch (insn[0]) {
case "token":
return [insn[1]];
case "repeat":
case "call":
return buildFirst(rules, insn[1]);
case "return":
return [];
}
}
function buildFirstSets(rules: Rules) {
const firsts: FirstSets = {};
for (const rule in rules) {
firsts[rule] = buildFirst(rules, rule);
}
return firsts;
}
//
// State management
//
function initialState(): State {
return {
lexPos: 0,
frames: [makeCallFrame("program")],
};
}
function copyState(s: State): State {
// This should copy anything `parse()` can mutate (so states can be persistently stored in the cache)
function copyFrame(f: Frame): Frame {
return {
...f,
astList: [...f.astList],
repeatList: f.repeatList && [...f.repeatList],
};
}
return {
lexPos: s.lexPos,
frames: s.frames.map(copyFrame),
};
}
function framesMatch(fa: Frame, fb: Frame) {
return (
fa.ipos == fb.ipos &&
fa.rule === fb.rule &&
arraysEqual(fa.repeatList, fb.repeatList, astMatch)
);
}
function astMatch(a: Ast, b: Ast): boolean {
// Since AST nodes are immutable within a single parse pass, we can also try an identity comparison first
return (
a === b || (a.type === b.type && arraysEqual(a.args, b.args, astMatch))
);
}
function parentAstMatch(fa: Frame[], fb: Frame[]) {
// Do not compare the current frame since it will be "fixed up"
const currentFrameIdx = fa.length - 1;
return (
fa.length === fb.length &&
fa.every((f, idx) =>
idx === currentFrameIdx
? true
: arraysEqual(f.astList, fb[idx].astList, astMatch)
)
);
}
/** Check if two parsing states match ie. if after a possible "fix up",
* proceeding from these two states is guaranteed to have an identical result. */
function statesMatch(sa: State, sb: State) {
return (
arraysEqual(sa.frames, sb.frames, framesMatch) &&
parentAstMatch(sa.frames, sb.frames)
);
}
//
// Instructions
//
// NOTE: The input is a string with each character representing one token (real parsers would have a lexer)
function isToken(s: string) {
return s.length === 1;
}
function simplify(insn: MacroInstruction): SimpleInstruction {
if (typeof insn === "string") {
if (isToken(insn)) {
return ["token", insn];
} else {
return ["call", insn];
}
} else if (typeof insn === "undefined") {
// Treat reading past the end of the instructions array as a "return" instruction
return ["return"];
} else if (Array.isArray(insn)) {
return insn;
} else {
error(`unimplemented instruction ${insn}`);
}
}
//
// Execution
//
function makeCallFrame(rule: RuleName): Frame {
return {
rule,
ipos: 0,
astList: [],
repeatList: null,
result: null,
};
}
function shouldSync(rule: RuleName) {
return rule === "function";
}
/** Execute one step in the parser state machine */
function parse(parser: Parser): boolean {
const { rules, firsts, state, input, cache } = parser;
function beginCall(rule: string) {
state.frames.push(makeCallFrame(rule));
}
function expect(t: string) {
if (token === t) {
state.lexPos++;
} else {
error(`expected ${insn}, got ${token}`);
}
}
function topFrame() {
return last(state.frames);
}
// Read current instruction
const frame = topFrame();
const { rule, ipos } = frame;
const insn = simplify(rules[rule][ipos]);
// Read current token
const { lexPos } = state;
const token = input.charAt(lexPos);
console.log("execute", { rule, insn, token });
switch (insn[0]) {
case "token":
expect(insn[1]);
frame.ipos++;
break;
case "call":
// Pre-increment so call will return to following instruction
frame.ipos++;
beginCall(insn[1]);
break;
case "return": {
// Build resulting AST node for the current rule
const returnValue = makeAst(rule, frame.astList, null);
// Cache AST node for use in next incremental run
frame.result = returnValue;
if (shouldSync(rule)) {
// Try to match up with previous state if we are incrementally parsing
const { diff } = parser;
if (diff) {
const prevState = cache[lexPos];
if (prevState) {
console.log("Encountered previous state at", {
lexPos,
});
printState(prevState);
printState(state);
if (statesMatch(prevState, state)) {
console.log("States match, returning");
// Mutate the result of this rule from the last run in-place to update it with the new nodes
const prevResult = last(prevState.frames).result;
makeAst(rule, frame.astList, prevResult);
return true;
} else {
console.log("State mismatch, continuing");
}
}
// Clear previously cached values (which are now invalid because the input has changed)
console.log("Clearing old cache", {
from: diff.lastUsedCachePos,
to: lexPos,
});
cache.fill(null, diff.lastUsedCachePos, lexPos);
diff.lastUsedCachePos = lexPos;
}
// Store current state into cache
cache[lexPos] = copyState(state);
}
state.frames.pop();
if (state.frames.length > 0) {
// Append the resulting node of the current rule to the node list of the caller
topFrame().astList.push(returnValue);
} else {
console.log("done");
parser.result = returnValue;
return true;
}
break;
}
case "repeat": {
if (frame.repeatList === null) {
// First time executing "repeat"
frame.repeatList = [];
} else {
// Add returned node from call to repeated node list
frame.repeatList.push(frame.astList.pop());
}
// Check if the wanted rule will follow (based on the first set of that rule)
const rule = insn[1];
const first = firsts[rule];
if (first.includes(token)) {
beginCall(rule);
} else {
// We are done repeating, proceed
frame.astList.push({ type: "list", args: frame.repeatList });
frame.ipos++;
frame.repeatList = null;
}
break;
}
}
return false;
}
/** Build an AST node or update it, depending on whether `previous` is set. */
function makeAst(rule: RuleName, args: Ast[], previous: Ast | null): Ast {
console.log("makeAst", { rule, args, previous });
if (previous) {
previous.args = args;
} else {
return { type: rule, args };
}
}
//
// Incremental parsing utilities
//
function findLastValidState(cache: StateCache, startPos: number) {
let res = null;
for (const pos in cache) {
if (+pos < startPos) {
res = cache[pos];
} else {
break;
}
}
return res;
}
function changeInput(parser: Parser, input: string) {
const prevInput = parser.input;
const prefixLen = commonLength(input, prevInput, false);
const suffixLen = commonLength(input, prevInput, true);
const lenDelta = input.length - prevInput.length;
const continueState =
findLastValidState(parser.cache, prefixLen) ?? initialState();
logObj({ prefixLen, suffixLen, lenDelta, state: continueState }, 3);
// Update the positions of the previously cached states
if (lenDelta > 0) {
// Shift previously cached values to the right
parser.cache.length = input.length; // NOTE: This deletes the "EOF" cache element
parser.cache.copyWithin(
input.length - suffixLen,
prevInput.length - suffixLen
);
// Clear cache for current state to prevent immediate cache match
delete parser.cache[continueState.lexPos];
// Clear cache for the part that changed
for (let i = prefixLen; i < input.length - suffixLen; i++)
delete parser.cache[i];
printCache(parser.cache);
} else {
error("deleting text unimplemented");
}
// Update parser state to continue from last valid position
parser.diff = {
lastUsedCachePos: continueState.lexPos,
};
parser.input = input;
parser.state = continueState;
}
//
// Debugging
//
function printAst(a: Ast, level = 0) {
if (!a) {
error(`ast missing`);
}
let s = "";
for (let i = 0; i < level - 1; i++) s += " ";
if (level > 0) {
s += "+-- ";
}
s += `${a.type}`;
console.log(s);
for (let arg of a.args) {
printAst(arg, level + 1);
}
}
function frameToString(f: Frame) {
const ip = `${f.rule} @${f.ipos}`;
let s = `${ip.padEnd(15)} - ${f.astList.length} nodes`;
if (f.repeatList !== null) {
s += ` repeating(${f.repeatList.length})`;
}
return s;
}
function printCache(c: StateCache) {
console.log();
console.log("cache {");
let i = 0;
for (;;) {
let gap = 0;
while (i < c.length && c[i] == null) {
gap++;
i++;
}
if (gap > 0) console.log(`<${gap} empty slots>`);
if (i >= c.length) break;
const s = c[i];
console.log(`@${i}: lexPos=${s.lexPos}, frames:`);
for (const f of s.frames) {
console.log(` ${frameToString(f)}`);
}
i++;
}
console.log("} end cache");
}
function printFrame(f: Frame) {
console.log(" " + frameToString(f));
for (const a of f.astList) {
printAst(a, 2);
}
}
function printState(s: State) {
console.log();
console.log(`State @ ${s.lexPos} {`);
for (const f of s.frames) {
printFrame(f);
console.log();
}
console.log("} end state");
}
//
// Main
//
const rules: Rules = {
program: [["repeat", "function"]],
function: ["identifier", "{", ["repeat", "statement"], "}"],
statement: ["identifier", "=", "identifier", ";"],
identifier: ["n"],
};
function runParser(parser: Parser, maxSteps = 100) {
for (let i = 0; i < maxSteps; i++) {
const done = parse(parser);
if (done) return parser.result;
}
}
function main() {
const input = "n{}n{n=n;}n{}";
const newInput = "n{}n{n=n;n=n;n=n;}n{}";
const parser: Parser = {
rules,
input,
state: initialState(),
firsts: buildFirstSets(rules),
cache: [],
};
console.log("firsts", parser.firsts);
const result = runParser(parser);
printAst(result);
{
console.log("Running incrementally");
printCache(parser.cache);
changeInput(parser, newInput);
const result = runParser(parser);
printAst(result);
}
}
main();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment