Skip to content

Instantly share code, notes, and snippets.

@pepasflo
Last active July 30, 2023 09:03
Show Gist options
  • Save pepasflo/4afa5813606b6ee73526a0d21d0d1035 to your computer and use it in GitHub Desktop.
Save pepasflo/4afa5813606b6ee73526a0d21d0d1035 to your computer and use it in GitHub Desktop.
A regex-based javascript lexer / scanner / tokenizer
#!/usr/bin/env node
var assert = require('assert');
// Each lexed token is a array of three integers:
// 1. the "token type": an index into the list of token patterns.
// 2. the index into the input string marking the start of this token.
// 3. the length of the token.
// The list of "token types" which our lexer understands:
var patterns = [
["wspace", /^\s+/],
["identifier", /^[a-zA-Z_]\w*/],
["oparen", /^\(/],
["cparen", /^\)/],
["float", /^-?\d+\.\d+/],
["int", /^-?\d+/],
["obrace", /^\{/],
["cbrace", /^\}/],
["obracket", /^\[/],
["cbracket", /^\]/],
// match two quotes containing any number of escaped-backslash, escaped quote, or any non-quote characters.
["string", /^"(\\\\|\\"|[^"])*"/],
["comment", /^\/\/.*/],
["other", /^[\s\S]/],
];
// finds the next token in the input string, starting at input[i], using the given token patterns.
// returns a token and an updated starting index, or causes assertion failure if it was unable to parse a token.
function read_token(input, i, patterns) {
// assert.ok(input.length > 0, "zero-length input");
// assert.ok(i < input.length, "starting index past end of input");
for (var j = 0; j < patterns.length; j++) {
var regex = patterns[j][1];
var result = input.slice(i).match(regex);
if (result !== null) {
var text = result[0];
var token = [j, i, text.length];
return [token, i + text.length];
}
}
assert.fail("exhausted input while lexing token");
}
// takes an input string and a list of token patterns and tokenizes the string.
// returns a list of tokens.
function tokenize(input, patterns) {
var tokens = [];
for (var i = 0; i < input.length;) {
var result = read_token(input, i, patterns);
var token = result[0];
i = result[1];
tokens.push(token);
// console.log("Debug: found token: ", token);
}
return tokens;
}
// Read from stdin
var fs = require('fs');
var input = fs.readFileSync(0).toString();
// Tokenize the input and track some benchmark stats
var then = Date.now();
var tokens = tokenize(input, patterns);
var now = Date.now();
var elapsed = now - then;
console.log("Lexed", tokens.length, "tokens in", elapsed, "ms (", Math.round(tokens.length/(elapsed/1000)), "tokens/sec)");
// Print out the lexed tokens (if less than 100 tokens)
if (tokens.length < 100) {
console.log("\nLexed tokens:");
for (var i = 0; i < tokens.length; i++) {
var token = tokens[i];
var token_type = patterns[token[0]][0];
var token_text = input.substr(token[1], token[2]);
console.log(token, token_type, token_text);
}
}
// Print out some token frequency stats (use this to optimize the order of the list of regex patterns)
// var stats = {};
// for (var i = 0; i < patterns.length; i++) {
// stats[i] = 0;
// }
// for (var i = 0; i < tokens.length; i++) {
// var token = tokens[i];
// stats[token[0]] += 1;
// }
// var stats2 = {};
// for (var i = 0; i < patterns.length; i++) {
// var label = patterns[i][0];
// stats2[label] = stats[i];
// }
// console.log(stats2);
# combine all C files in the linux kernel source as test input:
$ curl curl https://cdn.kernel.org/pub/linux/kernel/v4.x/linux-4.19.tar.xz | unxz | tar x
$ cd linux-4.19
$ cat **/*.c > ../linux-4.19.c
# the result is a ~500k line C file:
$ wc -l linux-4.19.c
493065 linux-4.19.c
# benchmark the lexer:
$ cat linux-4.19.c | ./lexer.js
Lexed 3950697 tokens in 1785 ms ( 2213276 tokens/sec)
int main(void) { exit(0); }
(list 1 2 "A () 123 \\\"")
// this is a comment
some plain text
$ cat input.txt | ./lexer.js
Lexed 35 tokens in 0 ms ( Infinity tokens/sec)
Lexed tokens:
[ 1, 0, 3 ] 'identifier' 'int'
[ 0, 3, 1 ] 'wspace' ' '
[ 1, 4, 4 ] 'identifier' 'main'
[ 2, 8, 1 ] 'oparen' '('
[ 1, 9, 4 ] 'identifier' 'void'
[ 3, 13, 1 ] 'cparen' ')'
[ 0, 14, 1 ] 'wspace' ' '
[ 6, 15, 1 ] 'obrace' '{'
[ 0, 16, 1 ] 'wspace' ' '
[ 1, 17, 4 ] 'identifier' 'exit'
[ 2, 21, 1 ] 'oparen' '('
[ 5, 22, 1 ] 'int' '0'
[ 3, 23, 1 ] 'cparen' ')'
[ 12, 24, 1 ] 'other' ';'
[ 0, 25, 1 ] 'wspace' ' '
[ 7, 26, 1 ] 'cbrace' '}'
[ 0, 27, 1 ] 'wspace' '\n'
[ 2, 28, 1 ] 'oparen' '('
[ 1, 29, 4 ] 'identifier' 'list'
[ 0, 33, 1 ] 'wspace' ' '
[ 5, 34, 1 ] 'int' '1'
[ 0, 35, 1 ] 'wspace' ' '
[ 5, 36, 1 ] 'int' '2'
[ 0, 37, 1 ] 'wspace' ' '
[ 10, 38, 15 ] 'string' '"A () 123 \\\\\\""'
[ 3, 53, 1 ] 'cparen' ')'
[ 0, 54, 1 ] 'wspace' '\n'
[ 11, 55, 20 ] 'comment' '// this is a comment'
[ 0, 75, 1 ] 'wspace' '\n'
[ 1, 76, 4 ] 'identifier' 'some'
[ 0, 80, 1 ] 'wspace' ' '
[ 1, 81, 5 ] 'identifier' 'plain'
[ 0, 86, 1 ] 'wspace' ' '
[ 1, 87, 4 ] 'identifier' 'text'
[ 0, 91, 1 ] 'wspace' '\n'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment