An attempt to implement ANTLR-like functionality in TypeScript.
const EOF = Symbol("EOF")
type TokenStream = {
    token: Token
    start: number
    end: number
}[]
type Token = {
    kind: "token"
    name: string
    regex: RegExp
}
type ParserRule = {
    kind: "rule"
    name: string
    tokens: () => SubRule
}
type SubRule = Token | ParserRule | OrSubRule | AndSubRule | OneOrMoreRule | ZeroOrMoreRule | ZeroOrOneRule
type OrSubRule = ["or", ...SubRule[]]
type AndSubRule = ["and", ...SubRule[]]
type OneOrMoreRule = ["+", SubRule]
type ZeroOrOneRule = ["?", SubRule]
type ZeroOrMoreRule = ["*", SubRule]
function and(...r: SubRule[]) {
    return (["and", ...r] as unknown) as AndSubRule
}
function or(...r: SubRule[]) {
    return (["or", ...r] as unknown) as OrSubRule
}
function oneOrMore(r: SubRule) {
    return ["+", r] as OneOrMoreRule
}
function zeroOrMore(r: SubRule) {
    return ["*", r] as ZeroOrMoreRule
}
function zeroOrOne(r: SubRule) {
    return ["?", r] as ZeroOrOneRule
}
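// For illustration, an EBNF-style rule such as
//   decl := "let" Identifier "=" expr
// is written with these combinators as and(Let, Id, Equal, expr),
// and file := decl* becomes zeroOrMore(decl) (see the grammar further down).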
type AST = {
    kind?: string
    // indices into the token stream
    tokenStart: number
    tokenEnd: number
    // character offsets into the source string
    start: number
    end: number
    children?: AST[]
}
function Lexer(source: string) {
    const knownTokens: (Token & { skip?: boolean })[] = []
    function token(name: string, regex: RegExp, skip?: boolean) {
        const def = {
            name,
            regex,
            skip,
            kind: "token" as const,
        }
        knownTokens.push(def)
        return def
    }
    return {
        token,
        tokenize(): TokenStream {
            let start = 0
            let res: TokenStream = []
            // Tokens are tried in registration order, so register more specific tokens
            // (e.g. LetKeyword) before more general ones (e.g. Identifier).
            matchSource: while (start < source.length) {
                for (const token of knownTokens) {
                    const match = source.slice(start).match(token.regex)
                    if (match) {
                        const tokenLength = match[0].length
                        const end = start + tokenLength
                        if (!token.skip) {
                            res.push({
                                token,
                                start,
                                end,
                            })
                        }
                        start += tokenLength
                        continue matchSource
                    }
                }
                throw new Error("unknown token at " + start + ": " + source.slice(start, start + 5) + "...")
            }
            return res
        },
    }
}
function Parser(tokenStream: TokenStream) {
    function read(readOffset: number) {
        let nextToken
        if ((nextToken = tokenStream[readOffset])) {
            return nextToken
        } else {
            return EOF
        }
    }
    function rule(name: string, tokens: () => SubRule): ParserRule {
        return {
            name,
            tokens,
            kind: "rule" as const,
        }
    }
    function matchRule(rule: SubRule, offset: number, matchAll: boolean): null | { nodes: AST[]; tokenLength: number } {
        if (Array.isArray(rule)) {
            const [op, ...tokenRules] = rule
            switch (op) {
                case "and": {
                    // every sub-rule must match, in sequence
                    const subMatches: AST[] = []
                    let childOffset = 0
                    for (const subRule of tokenRules) {
                        const subMatch = matchRule(subRule, offset + childOffset, false)
                        if (subMatch) {
                            childOffset += subMatch.tokenLength
                            subMatches.push(...subMatch.nodes)
                        } else {
                            return null
                        }
                    }
                    return {
                        nodes: subMatches,
                        tokenLength: childOffset,
                    }
                }
                case "or": {
                    // first matching alternative wins
                    for (const subRule of tokenRules) {
                        let subMatch = matchRule(subRule, offset, matchAll)
                        if (subMatch && !(matchAll && subMatch.tokenLength + offset < tokenStream.length)) {
                            return subMatch
                        }
                    }
                    return null
                }
                case "?":
                case "+":
                case "*": {
                    const subRule = tokenRules[0]
                    let childOffset = offset
                    let subMatches: AST[] = []
                    while (childOffset < tokenStream.length) {
                        const subMatch = matchRule(subRule, childOffset, false)
                        if (subMatch) {
                            childOffset += subMatch.tokenLength
                            subMatches.push(...subMatch.nodes)
                            if (op === "?") {
                                break
                            }
                        } else {
                            break
                        }
                    }
                    if (!subMatches.length) {
                        // "*" and "?" may match nothing; "+" requires at least one match
                        if (op === "*" || op === "?") {
                            return {
                                nodes: [],
                                tokenLength: 0,
                            }
                        } else {
                            return null
                        }
                    } else {
                        return {
                            nodes: subMatches,
                            tokenLength: childOffset - offset,
                        }
                    }
                }
            }
        } else if (rule.kind === "rule") {
            const subRules = rule.tokens()
            const subMatch = matchRule(subRules, offset, matchAll)
            if (subMatch) {
                if (!subMatch.nodes.length) {
                    return {
                        nodes: [],
                        tokenLength: 0,
                    }
                }
                // debug output; sourceCode is the module-level source string
                const sourceLocation = sourceCode.slice(subMatch.nodes[0].start, subMatch.nodes[subMatch.nodes.length - 1].end)
                console.log(`matched rule ${rule.name} at ${sourceLocation}`)
                return {
                    nodes: [
                        {
                            kind: rule.name,
                            tokenStart: offset,
                            tokenEnd: offset + subMatch.tokenLength,
                            start: subMatch.nodes[0].start,
                            end: subMatch.nodes[subMatch.nodes.length - 1].end,
                            children: subMatch.nodes,
                        },
                    ],
                    tokenLength: subMatch.tokenLength,
                }
            } else {
                return null
            }
        } else {
            // single token: matches if the next token in the stream is this token definition
            const nextToken = read(offset)
            if (nextToken === EOF) {
                return null
            } else if (nextToken.token === rule) {
                return {
                    nodes: [
                        {
                            kind: rule.name,
                            tokenStart: offset,
                            start: nextToken.start,
                            end: nextToken.end,
                            tokenEnd: offset + 1,
                        },
                    ],
                    tokenLength: 1,
                }
            } else {
                return null
            }
        }
    }
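    // Top-level entry point: tries the rule from the first token. matchAll = true makes
    // "or" alternatives reject parses that leave part of the token stream unconsumed.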
    function match(expr: ParserRule) {
        return matchRule(expr, 0, true)
    }
    return {
        rule,
        read,
        match,
    }
}
const sourceCode = `
let a = 3
let b = (10 + a) * 3 / 2
let c = a * b / 2
`
const lexer = Lexer(sourceCode)
const Let = lexer.token("LetKeyword", /^let/)
const Equal = lexer.token("Equal", /^\=/)
const Num = lexer.token("Number", /^\d+/)
const Add = lexer.token("Add", /^\+/)
const Id = lexer.token("Identifier", /^[a-z_A-Z]+/)
const Mul = lexer.token("Multiply", /^\*/)
const Div = lexer.token("Divide", /^\//)
const WS = lexer.token("WhiteSpace", /^(\s|\n)+/, true)
const LBracket = lexer.token("LeftBracket", /^\(/)
const RBracket = lexer.token("RightBracket", /^\)/)
const tokens = lexer.tokenize()
const someLangParser = Parser(tokens)
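// Grammar for the sample source above. mul/div/add each recurse into expr on the
// right, so binary expressions nest to the right and there is no operator
// precedence: e.g. "10 * a + 3" would parse as 10 * (a + 3).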
const mul = someLangParser.rule("mul", () => and(Mul, expr))
const div = someLangParser.rule("div", () => and(Div, expr))
const add = someLangParser.rule("add", () => and(Add, expr))
const expr = someLangParser.rule("expr", () =>
    //prettier-ignore
    or(
        and(
            or(
                Id,
                Num,
                and(
                    LBracket, expr, RBracket
                )
            ),
            or(mul, div, add)
        ),
        and(
            LBracket, expr, RBracket
        ),
        Id,
        Num
    )
)
const decl = someLangParser.rule("decl", () => {
    return and(Let, Id, Equal, expr)
})
const file = someLangParser.rule("file", () => {
    return zeroOrMore(decl)
})
const ast = someLangParser.match(file)
ast && printAst(ast.nodes[0])
function printAst(ast: AST, indent = 0) {
    if (ast.kind) {
        const tokenStart = tokens[ast.tokenStart]
        const tokenEnd = tokens[ast.tokenEnd - 1]
        console.log(`${" ".repeat(indent)} ${ast.kind} "${sourceCode.slice(tokenStart.start, tokenEnd.end)}"`)
    }
    if (ast.children) {
        ast.children.forEach(x => printAst(x, indent + 1))
    }
}
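For illustration, the same token/rule API could be extended with another operator. This is only a sketch: the Sub token and sub rule below are hypothetical additions, not part of the original code; the token would have to be registered before lexer.tokenize() is called, and sub would need to be added to the alternatives inside expr (e.g. or(mul, div, add, sub)).

// Hypothetical extension (not in the original file): subtraction support.
// Register the token before tokenize() runs and add `sub` to the or(...) in expr.
const Sub = lexer.token("Subtract", /^\-/)
const sub = someLangParser.rule("sub", () => and(Sub, expr))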