|
/* |
|
* TODO next time I'm bored: write AST -> source serializer |
|
* and random tree generator to run a fuzzer. |
|
*/ |
|
|
|
const CHAR_a = "a".charCodeAt(0); |
|
const CHAR_z = "z".charCodeAt(0); |
|
const CHAR_A = "A".charCodeAt(0); |
|
const CHAR_Z = "Z".charCodeAt(0); |
|
const CHAR_0 = "0".charCodeAt(0); |
|
const CHAR_9 = "9".charCodeAt(0); |
|
|
|
class SyntaxError extends Error {} |
|
class ParseError extends Error {} |
|
|
|
function* lex(src) { |
|
let p = -1; |
|
let c = ""; |
|
let comment; |
|
let mode = "value"; // how to interpret symbols |
|
|
|
function next() { |
|
c = src[++p]; |
|
console.log("char", JSON.stringify(c)); |
|
} |
|
|
|
function isAlpha() { |
|
if (!c) return false; |
|
const code = c.charCodeAt(0); |
|
if (CHAR_a <= code && code <= CHAR_z) return true; |
|
if (CHAR_A <= code && code <= CHAR_Z) return true; |
|
if (c === "_") return true; |
|
return false; |
|
} |
|
|
|
function isNumber() { |
|
if (!c) return false; |
|
const code = c.charCodeAt(0); |
|
if (CHAR_0 <= code && code <= CHAR_9) return true; |
|
return false; |
|
} |
|
|
|
function isWhitespace() { |
|
return c === " " || c === "\t"; |
|
} |
|
|
|
function eof() { |
|
if (c !== undefined) { |
|
return null; |
|
} |
|
|
|
const begin = p; |
|
const end = p; |
|
return { |
|
type: "eof", |
|
begin, |
|
end, |
|
}; |
|
} |
|
|
|
function whitespace() { |
|
while (isWhitespace()) { |
|
next(); |
|
} |
|
} |
|
|
|
function semicolon() { |
|
const begin = p; |
|
if (c == ";") { |
|
do { |
|
next(); |
|
} while (c == ";"); |
|
return { |
|
type: ";", |
|
begin, |
|
end: p, |
|
}; |
|
} |
|
} |
|
|
|
function id() { |
|
let name = ""; |
|
const begin = p; |
|
|
|
while (isAlpha()) { |
|
name += c; |
|
next(); |
|
} |
|
|
|
const end = p; |
|
|
|
if (!name) { |
|
return null; |
|
} |
|
|
|
return { |
|
type: "id", |
|
begin, |
|
end, |
|
name, |
|
}; |
|
} |
|
|
|
function number() { |
|
let valueStr = ""; |
|
const begin = p; |
|
|
|
while (isNumber()) { |
|
valueStr += c; |
|
next(); |
|
} |
|
|
|
const end = p; |
|
|
|
if (!valueStr) { |
|
return null; |
|
} |
|
|
|
const value = Number(valueStr); |
|
|
|
return { |
|
type: "number", |
|
begin, |
|
end, |
|
value, |
|
}; |
|
} |
|
|
|
function readComment() { |
|
let body = ""; |
|
for (;;) { |
|
while (c != "*") { |
|
body += c; |
|
next(); |
|
} |
|
next(); |
|
if (c == "/") { |
|
next(); |
|
return body; |
|
} |
|
body += "*"; |
|
body += c; |
|
} |
|
} |
|
|
|
function readRegexp() { |
|
const begin = p; |
|
let value = ""; |
|
while (c != "/") { |
|
value += c; |
|
next(); |
|
} |
|
next(); |
|
|
|
return { |
|
type: "regexp", |
|
value, |
|
begin, |
|
end: p, |
|
}; |
|
} |
|
|
|
function operator() { |
|
const begin = p; |
|
|
|
if (c == "=") { |
|
next(); |
|
if (c == "=") { |
|
next(); |
|
return { |
|
type: "==", |
|
begin, |
|
end: p, |
|
}; |
|
} |
|
|
|
return { |
|
type: "=", |
|
begin, |
|
end: p, |
|
}; |
|
} |
|
|
|
if (c == "+") { |
|
next(); |
|
return { |
|
type: "+", |
|
begin, |
|
end: p, |
|
}; |
|
} |
|
|
|
if (c == "*") { |
|
next(); |
|
return { |
|
type: "*", |
|
begin, |
|
end: p, |
|
}; |
|
} |
|
|
|
if (c == "/") { |
|
next(); |
|
|
|
if (c == "*") { |
|
next(); |
|
comment = readComment(); |
|
return "recur"; |
|
} |
|
|
|
if (mode == "value") { |
|
return readRegexp(); |
|
} |
|
|
|
return { |
|
type: "/", |
|
begin, |
|
end: p, |
|
}; |
|
} |
|
|
|
return null; |
|
} |
|
|
|
next(); |
|
for (;;) { |
|
whitespace(); |
|
|
|
const maybeEof = eof(); |
|
if (maybeEof) { |
|
yield maybeEof; |
|
return; |
|
} |
|
|
|
const token = id() || operator() || number() || semicolon(); |
|
|
|
if (token == "recur") { |
|
continue; |
|
} |
|
|
|
if (token) { |
|
if (comment) { |
|
token.comment = comment; |
|
comment = undefined; |
|
} |
|
mode = yield token; |
|
continue; |
|
} |
|
|
|
if (c == null) { |
|
throw new SyntaxError(`unexpected EOF at ${p}`); |
|
} else { |
|
throw new SyntaxError(`unexpected symbol "${c}" at ${p}`); |
|
} |
|
} |
|
} |
|
|
|
function parse(tokens) { |
|
let t = null; |
|
|
|
function unexpected() { |
|
return new ParseError(`unexpected token "${t.type}" at ${t.begin}`); |
|
} |
|
|
|
function expected(types) { |
|
return new ParseError( |
|
`unexpected token "${t.type}" at ${t.begin}, expected ${types |
|
.map((t) => `"${t}"`) |
|
.join(" or ")}` |
|
); |
|
} |
|
|
|
// no pushing tokens back, going only forward |
|
function next(mode) { |
|
t = tokens.next(mode).value; |
|
console.log("token", t && t.type); |
|
} |
|
|
|
const read = (types) => () => { |
|
const _t = t; |
|
if (types.includes(_t.type)) { |
|
console.log("parse", "read", types); |
|
next(); |
|
return _t; |
|
} |
|
throw expected(types); |
|
}; |
|
|
|
const maybeRead = (types) => () => { |
|
const _t = t; |
|
if (types.includes(_t.type)) { |
|
console.log("parse", "maybeRead", types); |
|
next(); |
|
return _t; |
|
} |
|
return null; |
|
}; |
|
|
|
const eof = read(["eof"]); |
|
|
|
const maybeId = maybeRead(["id"]); |
|
|
|
const subExpressionTokens = ["number", "id", "regexp"]; |
|
const maybeSubExpression = maybeRead(subExpressionTokens); |
|
const subExpression = read(subExpressionTokens); |
|
|
|
function tryMul(left) { |
|
const type = t.type; |
|
if (type != "*" && type != "/") return left; |
|
console.log("parse", "tryMul"); |
|
next("value"); |
|
const right = tryMul(subExpression()); |
|
|
|
return { |
|
type: "operator", |
|
operator: type, |
|
left, |
|
right, |
|
begin: left.begin, |
|
end: right.end, |
|
}; |
|
} |
|
|
|
function tryMulOrSum(left) { |
|
left = tryMul(left); |
|
if (t.type != "+") return left; |
|
console.log("parse", "tryMulOrSum"); |
|
next("value"); |
|
|
|
const right = tryMulOrSum(subExpression()); |
|
|
|
return { |
|
type: "operator", |
|
operator: "+", |
|
left, |
|
right, |
|
begin: left.begin, |
|
end: right.end, |
|
}; |
|
} |
|
|
|
function expression(left) { |
|
const body = tryMulOrSum(left); |
|
return { |
|
type: "expression", |
|
body, |
|
begin: body.begin, |
|
end: body.end, |
|
}; |
|
} |
|
|
|
function maybeAssignment(left) { |
|
if (t.type != "=") return null; |
|
console.log("parse", "maybeAssignment"); |
|
next("value"); |
|
|
|
const right = expression(subExpression()); |
|
|
|
return { |
|
type: "assignment", |
|
left, |
|
right, |
|
begin: left.begin, |
|
end: right.end, |
|
}; |
|
} |
|
|
|
function maybeAssignmentOrExpression() { |
|
const id = maybeId(); |
|
if (id) { |
|
const assignment = maybeAssignment(id); |
|
if (assignment) return assignment; |
|
} |
|
|
|
const left = id || maybeSubExpression(); |
|
if (!left) return null; |
|
|
|
return expression(left); |
|
} |
|
|
|
// or closing curly brace wihout next |
|
const endStatement = read([";", "eof"]); |
|
|
|
function maybeStatement() { |
|
if (t === undefined) return null; |
|
const body = maybeAssignmentOrExpression(); |
|
const end = endStatement(); |
|
|
|
return { |
|
type: "statement", |
|
body, |
|
begin: body ? body.begin : end.begin, |
|
end: end.end, |
|
}; |
|
} |
|
|
|
function program() { |
|
const body = []; |
|
let node; |
|
console.log("parse", "program"); |
|
next("value"); |
|
while ((node = maybeStatement())) { |
|
body.push(node); |
|
} |
|
|
|
return { |
|
type: "program", |
|
body, |
|
}; |
|
} |
|
|
|
const node = program(); |
|
|
|
if (t !== undefined) { |
|
throw new ParseError(`tokens left, next: "${t.type}"`); |
|
} |
|
return node; |
|
} |
|
|
|
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - |
|
|
|
console.log("\n\n"); |
|
|
|
const program = parse( |
|
lex(`foo / 1 + 2 * 3 + 4 ; bar = /rex1/ /* bla-bla */ / /rex2/`) |
|
); |
|
|
|
console.dir(program, { depth: null }); |
|
|
|
console.log("done"); |
Like parsers? See also the Ruby one: https://github.com/peter-leonov/ruby-parser.js