Last active
July 30, 2023 09:03
-
-
Save pepasflo/4afa5813606b6ee73526a0d21d0d1035 to your computer and use it in GitHub Desktop.
A regex-based javascript lexer / scanner / tokenizer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env node

var assert = require('assert');

// Each lexed token is an array of three integers:
// 1. the "token type": an index into the list of token patterns.
// 2. the index into the input string marking the start of this token.
// 3. the length of the token.

// The list of "token types" which our lexer understands.
// Every regex is anchored with ^ because read_token matches against the
// remainder of the input (input.slice(i)), not the whole string.
// NOTE: order matters twice over:
//  - read_token tries the patterns top-to-bottom, so "float" must stay above
//    "int" (otherwise "1.5" would lex as int "1", other ".", int "5"), and the
//    catch-all "other" must stay last.
//  - a token's numeric type id is its index in this array, so reordering
//    entries changes the ids embedded in every emitted token.
var patterns = [
    ["wspace", /^\s+/],
    ["identifier", /^[a-zA-Z_]\w*/],
    ["oparen", /^\(/],
    ["cparen", /^\)/],
    ["float", /^-?\d+\.\d+/],
    ["int", /^-?\d+/],
    ["obrace", /^\{/],
    ["cbrace", /^\}/],
    ["obracket", /^\[/],
    ["cbracket", /^\]/],
    // match two quotes containing any number of escaped-backslash, escaped quote, or any non-quote characters.
    ["string", /^"(\\\\|\\"|[^"])*"/],
    ["comment", /^\/\/.*/],
    // catch-all: matches any single character (including newlines), so lexing never fails.
    ["other", /^[\s\S]/],
];
// finds the next token in the input string, starting at input[i], using the given token patterns. | |
// returns a token and an updated starting index, or causes assertion failure if it was unable to parse a token. | |
// Finds the next token in the input string, starting at input[i], using the
// given token patterns (an array of [name, anchored-regex] pairs, tried in order).
// Returns [token, next_i], where token is [pattern_index, start, length] and
// next_i is the index just past the end of the token.
// Causes an assertion failure if no pattern matched (unreachable as long as the
// pattern list ends with a catch-all like "other" and i < input.length).
function read_token(input, i, patterns) {
    // assert.ok(input.length > 0, "zero-length input");
    // assert.ok(i < input.length, "starting index past end of input");

    // Hoist the substring out of the loop: the original recomputed
    // input.slice(i) for every candidate pattern, copying the tail of the
    // input up to patterns.length times per token.
    var rest = input.slice(i);
    for (var j = 0; j < patterns.length; j++) {
        var result = rest.match(patterns[j][1]);
        if (result !== null) {
            var length = result[0].length;
            return [[j, i, length], i + length];
        }
    }
    assert.fail("exhausted input while lexing token");
}
// takes an input string and a list of token patterns and tokenizes the string. | |
// returns a list of tokens. | |
// Tokenizes the whole input string against the given pattern list.
// Returns an array of [type_index, start, length] tokens that covers the
// entire input (the catch-all pattern guarantees forward progress).
function tokenize(input, patterns) {
    var tokens = [];
    var pos = 0;
    while (pos < input.length) {
        var pair = read_token(input, pos, patterns);
        tokens.push(pair[0]);
        pos = pair[1];
        // console.log("Debug: found token: ", pair[0]);
    }
    return tokens;
}
// Read the entire input from stdin (file descriptor 0).
var fs = require('fs');
var input = fs.readFileSync(0).toString();

// Tokenize the input and track some benchmark stats.
var then = Date.now();
var tokens = tokenize(input, patterns);
var now = Date.now();
var elapsed = now - then;
// Guard the rate computation: a sub-millisecond run has elapsed === 0, which
// previously printed "Infinity tokens/sec". Treat it as "under 1 ms" instead.
var rate = elapsed > 0
    ? Math.round(tokens.length / (elapsed / 1000))
    : tokens.length * 1000;
console.log("Lexed", tokens.length, "tokens in", elapsed, "ms (", rate, "tokens/sec)");

// Print out the lexed tokens (only for small inputs, to keep output readable).
if (tokens.length < 100) {
    console.log("\nLexed tokens:");
    for (var i = 0; i < tokens.length; i++) {
        var token = tokens[i];
        var token_type = patterns[token[0]][0];
        // slice(start, start + len) replaces the deprecated String.prototype.substr.
        var token_text = input.slice(token[1], token[1] + token[2]);
        console.log(token, token_type, token_text);
    }
}

// Print out some token frequency stats (use this to optimize the order of the list of regex patterns)
// var stats = {};
// for (var i = 0; i < patterns.length; i++) {
//     stats[i] = 0;
// }
// for (var i = 0; i < tokens.length; i++) {
//     var token = tokens[i];
//     stats[token[0]] += 1;
// }
// var stats2 = {};
// for (var i = 0; i < patterns.length; i++) {
//     var label = patterns[i][0];
//     stats2[label] = stats[i];
// }
// console.log(stats2);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# combine all C files in the linux kernel source as test input: | |
$ curl https://cdn.kernel.org/pub/linux/kernel/v4.x/linux-4.19.tar.xz | unxz | tar x
$ cd linux-4.19 | |
$ cat **/*.c > ../linux-4.19.c | |
# the result is a ~500k line C file: | |
$ wc -l linux-4.19.c | |
493065 linux-4.19.c | |
# benchmark the lexer: | |
$ cat linux-4.19.c | ./lexer.js | |
Lexed 3950697 tokens in 1785 ms ( 2213276 tokens/sec) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
int main(void) { exit(0); } | |
(list 1 2 "A () 123 \\\"") | |
// this is a comment | |
some plain text |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ cat input.txt | ./lexer.js | |
Lexed 35 tokens in 0 ms ( Infinity tokens/sec) | |
Lexed tokens: | |
[ 1, 0, 3 ] 'identifier' 'int' | |
[ 0, 3, 1 ] 'wspace' ' ' | |
[ 1, 4, 4 ] 'identifier' 'main' | |
[ 2, 8, 1 ] 'oparen' '(' | |
[ 1, 9, 4 ] 'identifier' 'void' | |
[ 3, 13, 1 ] 'cparen' ')' | |
[ 0, 14, 1 ] 'wspace' ' ' | |
[ 6, 15, 1 ] 'obrace' '{' | |
[ 0, 16, 1 ] 'wspace' ' ' | |
[ 1, 17, 4 ] 'identifier' 'exit' | |
[ 2, 21, 1 ] 'oparen' '(' | |
[ 5, 22, 1 ] 'int' '0' | |
[ 3, 23, 1 ] 'cparen' ')' | |
[ 12, 24, 1 ] 'other' ';' | |
[ 0, 25, 1 ] 'wspace' ' ' | |
[ 7, 26, 1 ] 'cbrace' '}' | |
[ 0, 27, 1 ] 'wspace' '\n' | |
[ 2, 28, 1 ] 'oparen' '(' | |
[ 1, 29, 4 ] 'identifier' 'list' | |
[ 0, 33, 1 ] 'wspace' ' ' | |
[ 5, 34, 1 ] 'int' '1' | |
[ 0, 35, 1 ] 'wspace' ' ' | |
[ 5, 36, 1 ] 'int' '2' | |
[ 0, 37, 1 ] 'wspace' ' ' | |
[ 10, 38, 15 ] 'string' '"A () 123 \\\\\\""' | |
[ 3, 53, 1 ] 'cparen' ')' | |
[ 0, 54, 1 ] 'wspace' '\n' | |
[ 11, 55, 20 ] 'comment' '// this is a comment' | |
[ 0, 75, 1 ] 'wspace' '\n' | |
[ 1, 76, 4 ] 'identifier' 'some' | |
[ 0, 80, 1 ] 'wspace' ' ' | |
[ 1, 81, 5 ] 'identifier' 'plain' | |
[ 0, 86, 1 ] 'wspace' ' ' | |
[ 1, 87, 4 ] 'identifier' 'text' | |
[ 0, 91, 1 ] 'wspace' '\n' |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment