Created
November 9, 2018 23:40
-
-
Save jameskeane/1634b02a8c8e6cfca60b72ce57c7b3c3 to your computer and use it in GitHub Desktop.
Learn you parsing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// This code will teach you how to write a simple recursive descent parser. | |
// Corners will be cut, but by the end you will have the basis of most | |
// programming languages out there: strings, numbers, identifiers, and | |
// infix arithmetic expressions with precendence. | |
// At under 100 lines, I promise it won't take long. Happy Hacking! | |
// -- James Keane | |
// | |
// First the source code has to be tokenized, for the parser. | |
// Let's do five tokens, and ignore whitespace: | |
// 1. operators (+, -, *, etc.) | |
// 2. identifiers (variable_name, funcName, etc.) | |
// 3. numbers (1234, 3.1459, 0.0, etc) | |
// 4. strings ('hello', etc.) | |
// 5. control flow ({, }, ;, etc) | |
tokenize = (s) => (s === '') ? [] : ( | |
s.split(/\s*([-+*\/\%=\(\)]|[A-Za-z_][A-Za-z0-9_]*|[0-9]*\.?[0-9]+|\'[^\']*\'|[\;\}])\s*/g) | |
.filter((s) => !s.match(/^\s*$/)) | |
); | |
// A better tokenizer would keep track of line and column positions, stream | |
// symbols or objects with meaning so that we wouldn't need to repeat ourselves. | |
// | |
// Now let's repeat ourselves and start building the parser rules bottom up. | |
let t = [], i = 0; | |
// Starting simple, we'll match basic tokens that are direct AST values: | |
string = () => /^\'[^\']*\'$/.test(t[i]) ? | |
{ type: 'string', value: t[i].slice(1, t[i++].length - 1) } : paren(); | |
number = () => /^[0-9]*(\.[0-9]+)?$/.test(t[i]) ? | |
{ type: 'number', value: parseInt(t[i++], 10) } : string(); | |
identifier = () => /^[a-zA-Z_][a-zA-Z_0-9]*$/.test(t[i]) ? | |
{ type: 'identifier', ident: t[i++] } : number(); | |
// Notice how each rule tries to match it's corresponding token regex, and if | |
// it fails it calls the 'up' rule? That's recursive descent; we're just writing | |
// it 'upside down' because it's easier that way. | |
// In the `string` rule, you might've noticed that it's calling into a missing | |
// symbol `paren`? Remember that, it's important; it will be defined later. | |
// | |
// The basic idea behind recursive descent is to parse the source top down, i.e. | |
// we have a `program` => `statement`[] => `ifstmt` | `funcdef` | (`expression` + ';') | |
// etc. | |
// Let continue defining our expressions, with some mathematical operators. | |
// First though let's make a small helper since they all work pretty much the | |
// same way. Inside the helper, there is a subtle trick that handles operator | |
// precendence: the lhs calls down, but the rhs calls it self. | |
// This is the kind of flexibility that you just don't get with a parser | |
// generator. | |
op = (sym, lhs) => () => { | |
const l = lhs(); | |
return (t[i] !== sym) ? l : | |
{ type: 'operator', lhs: l, op: t[i++], rhs: op(sym, lhs)() }; | |
}; | |
// now for the operators, in (reverse) precedence order | |
divide = op('/', identifier); | |
modulo = op('%', divide) | |
multiply = op('*', modulo); | |
minus = op('-', multiply); | |
add = op('+', minus); | |
// Let's stop for a second and walk through how we just did that. Consider | |
// `a * 3 + b` and think about what will happen if we call `add`. Don't worry if | |
// it takes a while to think it through, this is the hardest part. | |
// | |
// The final piece is `paren`, it ties it up from the beginning! and allows for | |
// expressions to be wrapped by parentheses to control precedence | |
expression = add; | |
paren = () => { | |
if (t[i] !== '(') return null; | |
i++; | |
const next = expression(); | |
if (t[i++] !== ')') throw Error(`Unclosed parentheses at position ${i}.`); | |
return next; | |
}; | |
// all that's left is the parser entrypoint | |
module.exports = parse = (source) => { | |
i = 0, t = tokenize(source); | |
return expression(); | |
}; | |
// proof is in the pudding | |
console.log( parse(`a * 3 + b`) ); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment