Last active
July 30, 2023 09:03
-
-
Save pepasflo/4afa5813606b6ee73526a0d21d0d1035 to your computer and use it in GitHub Desktop.
A regex-based javascript lexer / scanner / tokenizer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env node

var assert = require('assert');

// Each lexed token is an array of three integers:
// 1. the "token type": an index into the list of token patterns.
// 2. the index into the input string marking the start of this token.
// 3. the length of the token.

// The list of "token types" which our lexer understands.
// Every regex is anchored with ^ because read_token matches against the
// remainder of the input (input.slice(i)), not the whole string.
// NOTE: order matters twice over:
//  - read_token tries the patterns top-to-bottom, so "float" must stay above
//    "int" (otherwise "1.5" would lex as int "1", other ".", int "5"), and the
//    catch-all "other" must stay last.
//  - a token's numeric type id is its index in this array, so reordering
//    entries changes the ids embedded in every emitted token.
var patterns = [
    ["wspace", /^\s+/],
    ["identifier", /^[a-zA-Z_]\w*/],
    ["oparen", /^\(/],
    ["cparen", /^\)/],
    ["float", /^-?\d+\.\d+/],
    ["int", /^-?\d+/],
    ["obrace", /^\{/],
    ["cbrace", /^\}/],
    ["obracket", /^\[/],
    ["cbracket", /^\]/],
    // match two quotes containing any number of escaped-backslash, escaped quote, or any non-quote characters.
    ["string", /^"(\\\\|\\"|[^"])*"/],
    ["comment", /^\/\/.*/],
    // catch-all: matches any single character (including newlines), so lexing never fails.
    ["other", /^[\s\S]/],
];
// finds the next token in the input string, starting at input[i], using the given token patterns. | |
// returns a token and an updated starting index, or causes assertion failure if it was unable to parse a token. | |
// Finds the next token in the input string, starting at input[i], using the
// given token patterns (an array of [name, anchored-regex] pairs, tried in order).
// Returns [token, next_i], where token is [pattern_index, start, length] and
// next_i is the index just past the end of the token.
// Causes an assertion failure if no pattern matched (unreachable as long as the
// pattern list ends with a catch-all like "other" and i < input.length).
function read_token(input, i, patterns) {
    // assert.ok(input.length > 0, "zero-length input");
    // assert.ok(i < input.length, "starting index past end of input");

    // Hoist the substring out of the loop: the original recomputed
    // input.slice(i) for every candidate pattern, copying the tail of the
    // input up to patterns.length times per token.
    var rest = input.slice(i);
    for (var j = 0; j < patterns.length; j++) {
        var result = rest.match(patterns[j][1]);
        if (result !== null) {
            var length = result[0].length;
            return [[j, i, length], i + length];
        }
    }
    assert.fail("exhausted input while lexing token");
}
// takes an input string and a list of token patterns and tokenizes the string. | |
// returns a list of tokens. | |
// Tokenizes the whole input string against the given pattern list.
// Returns an array of [type_index, start, length] tokens that covers the
// entire input (the catch-all pattern guarantees forward progress).
function tokenize(input, patterns) {
    var tokens = [];
    var pos = 0;
    while (pos < input.length) {
        var pair = read_token(input, pos, patterns);
        tokens.push(pair[0]);
        pos = pair[1];
        // console.log("Debug: found token: ", pair[0]);
    }
    return tokens;
}
// Read the entire input from stdin (file descriptor 0).
var fs = require('fs');
var input = fs.readFileSync(0).toString();

// Tokenize the input and track some benchmark stats.
var then = Date.now();
var tokens = tokenize(input, patterns);
var now = Date.now();
var elapsed = now - then;
// Guard the rate computation: a sub-millisecond run has elapsed === 0, which
// previously printed "Infinity tokens/sec". Treat it as "under 1 ms" instead.
var rate = elapsed > 0
    ? Math.round(tokens.length / (elapsed / 1000))
    : tokens.length * 1000;
console.log("Lexed", tokens.length, "tokens in", elapsed, "ms (", rate, "tokens/sec)");

// Print out the lexed tokens (only for small inputs, to keep output readable).
if (tokens.length < 100) {
    console.log("\nLexed tokens:");
    for (var i = 0; i < tokens.length; i++) {
        var token = tokens[i];
        var token_type = patterns[token[0]][0];
        // slice(start, start + len) replaces the deprecated String.prototype.substr.
        var token_text = input.slice(token[1], token[1] + token[2]);
        console.log(token, token_type, token_text);
    }
}

// Print out some token frequency stats (use this to optimize the order of the list of regex patterns)
// var stats = {};
// for (var i = 0; i < patterns.length; i++) {
//     stats[i] = 0;
// }
// for (var i = 0; i < tokens.length; i++) {
//     var token = tokens[i];
//     stats[token[0]] += 1;
// }
// var stats2 = {};
// for (var i = 0; i < patterns.length; i++) {
//     var label = patterns[i][0];
//     stats2[label] = stats[i];
// }
// console.log(stats2);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# combine all C files in the linux kernel source as test input: | |
$ curl https://cdn.kernel.org/pub/linux/kernel/v4.x/linux-4.19.tar.xz | unxz | tar x
$ cd linux-4.19 | |
$ cat **/*.c > ../linux-4.19.c | |
# the result is a ~500k line C file: | |
$ wc -l linux-4.19.c | |
493065 linux-4.19.c | |
# benchmark the lexer: | |
$ cat linux-4.19.c | ./lexer.js | |
Lexed 3950697 tokens in 1785 ms ( 2213276 tokens/sec) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
int main(void) { exit(0); } | |
(list 1 2 "A () 123 \\\"") | |
// this is a comment | |
some plain text |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ cat input.txt | ./lexer.js | |
Lexed 35 tokens in 0 ms ( Infinity tokens/sec) | |
Lexed tokens: | |
[ 1, 0, 3 ] 'identifier' 'int' | |
[ 0, 3, 1 ] 'wspace' ' ' | |
[ 1, 4, 4 ] 'identifier' 'main' | |
[ 2, 8, 1 ] 'oparen' '(' | |
[ 1, 9, 4 ] 'identifier' 'void' | |
[ 3, 13, 1 ] 'cparen' ')' | |
[ 0, 14, 1 ] 'wspace' ' ' | |
[ 6, 15, 1 ] 'obrace' '{' | |
[ 0, 16, 1 ] 'wspace' ' ' | |
[ 1, 17, 4 ] 'identifier' 'exit' | |
[ 2, 21, 1 ] 'oparen' '(' | |
[ 5, 22, 1 ] 'int' '0' | |
[ 3, 23, 1 ] 'cparen' ')' | |
[ 12, 24, 1 ] 'other' ';' | |
[ 0, 25, 1 ] 'wspace' ' ' | |
[ 7, 26, 1 ] 'cbrace' '}' | |
[ 0, 27, 1 ] 'wspace' '\n' | |
[ 2, 28, 1 ] 'oparen' '(' | |
[ 1, 29, 4 ] 'identifier' 'list' | |
[ 0, 33, 1 ] 'wspace' ' ' | |
[ 5, 34, 1 ] 'int' '1' | |
[ 0, 35, 1 ] 'wspace' ' ' | |
[ 5, 36, 1 ] 'int' '2' | |
[ 0, 37, 1 ] 'wspace' ' ' | |
[ 10, 38, 15 ] 'string' '"A () 123 \\\\\\""' | |
[ 3, 53, 1 ] 'cparen' ')' | |
[ 0, 54, 1 ] 'wspace' '\n' | |
[ 11, 55, 20 ] 'comment' '// this is a comment' | |
[ 0, 75, 1 ] 'wspace' '\n' | |
[ 1, 76, 4 ] 'identifier' 'some' | |
[ 0, 80, 1 ] 'wspace' ' ' | |
[ 1, 81, 5 ] 'identifier' 'plain' | |
[ 0, 86, 1 ] 'wspace' ' ' | |
[ 1, 87, 4 ] 'identifier' 'text' | |
[ 0, 91, 1 ] 'wspace' '\n' |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment