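// Benchmark: lexing the same input with a hand-rolled dense loop
// (splitToTokens) versus a tokenizer built from monadic parser combinators
// (the Tokenizer class), comparing throughput in bytes per second.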
// set :: string -> (char -> bool)
var set = (items) => {
    var arr = Array.prototype.slice.apply(items)
    var carrier = {}
    arr.forEach(x => { carrier[x] = true })
    return (x) => carrier[x]
}

var reserved = set("module object end => let in \\ open ---".split(" "))
var notReserved = (w) => !reserved(w)

var space = set(" \n\t")
var punctuation = set("{[(;,.:)]}")
var digit = set("0123456789")
var quote = set("'\"")
var nameChar = (c) => c && !set(" \n\t{[(;,.:)]}'\"")(c)
var nameStart = (c) => c && !set(" \n\t{[(;,.:)]}'\"0123456789")(c)
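// A Tokenizer wraps a function from an input stream ({text, offset}) to a
// result: a result with a `value` field carries the parsed token(s) plus the
// advanced stream, while an empty result ({}) signals failure. `of` and
// `then` make this a monad; `or` is backtracking choice (on failure the
// alternative restarts from the original stream) and `many` is repetition.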
class Tokenizer {
    constructor(runTokenizer) {
        // runTokenizer :: {text, offset} -> {value?, stream?}
        this.runTokenizer = runTokenizer
    }
    // lift a plain value into a tokenizer that consumes no input
    static of(value) {
        return new Tokenizer(stream => ({stream, value}))
    }
    // monadic bind: run `this`, then feed its value into `rest`
    then(rest) {
        return new Tokenizer(stream => {
            var res = this.runTokenizer(stream)
            return res.value
                ? rest(res.value).runTokenizer(res.stream)
                : res
        })
    }
    // ordered choice: try `this`; on failure, try `other` on the same stream
    or(other) {
        return new Tokenizer(stream => {
            var res = this.runTokenizer(stream)
            return res.value
                ? res
                : other.runTokenizer(stream)
        })
    }
    // run both in sequence, keeping the left value
    and_(other) {
        return this.then(x => other.map(_ => x))
    }
    // run both in sequence, keeping the right value
    _and(other) {
        return this.then(_ => other)
    }
    map(f) {
        return this.then(x => Tokenizer.of(f(x)))
    }
    // zero-or-more repetitions, collecting the values into an array
    many() {
        return new Tokenizer(stream => {
            var res, acc = []
            while ((res = this.runTokenizer(stream)).value) {
                acc.push(res.value)
                stream = res.stream
            }
            return {value: acc, stream}
        })
    }
}
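// Primitive tokenizers, one per token class. Each inspects the text directly
// at the current offset and, on success, yields a token {text, type, pos}
// together with the advanced stream.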
var int = new Tokenizer(({text, offset}) => {
    var i
    for (i = offset; text[i] && digit(text[i]); i++);
    if (i == offset)
        return {}
    // optional fractional part
    if (text[i] == '.') {
        i++
        for (; text[i] && digit(text[i]); i++);
    }
    return {value: {text: text.slice(offset, i), type: "number", pos: offset}, stream: {text, offset: i}}
})

var string = new Tokenizer(({text, offset}) => {
    var q = text[offset]  // opening quote character, ' or "
    if (!quote(q))
        return {}
    var i
    for (i = offset + 1; text[i] && text[i] != q; i++);
    i++  // include the closing quote
    return {value: {text: text.slice(offset, i), type: "string part", pos: offset}, stream: {text, offset: i}}
})

var spaces = new Tokenizer(({text, offset}) => {
    var i
    for (i = offset; text[i] && space(text[i]); i++);
    // always succeeds, possibly consuming nothing
    return {value: {text: "", type: "space", pos: offset}, stream: {text, offset: i}}
})

var punct = new Tokenizer(({text, offset}) => {
    if (!punctuation(text[offset]))
        return {}
    // punctuation tokens are always a single character
    return {value: {text: text[offset], type: "punctuation", pos: offset}, stream: {text, offset: offset + 1}}
})

var name = new Tokenizer(({text, offset}) => {
    if (!text[offset] || !nameStart(text[offset]))
        return {}
    var i
    for (i = offset + 1; text[i] && nameChar(text[i]); i++);
    return {value: {text: text.slice(offset, i), type: "name", pos: offset}, stream: {text, offset: i}}
})
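// The same lexer as one dense hand-written loop: skip spaces, dispatch on
// the first character, scan the token in place, repeat. No closures or
// intermediate result objects beyond the tokens themselves.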
var splitToTokens = (text) => {
    var offset = 0
    var tokens = []
    while (offset < text.length) {
        for (; space(text[offset]); offset++);
        var i = offset
        var here = text[offset]
        if (digit(here)) {
            for (i = offset; text[i] && digit(text[i]); i++);
            if (text[i] == '.') {
                i++
                for (; text[i] && digit(text[i]); i++);
            }
            tokens.push({
                text: text.slice(offset, i),
                type: "number",
                pos: offset
            })
        }
        if (quote(here)) {
            for (i = offset + 1; text[i] && text[i] != here; i++);
            i++
            tokens.push({
                text: text.slice(offset, i),
                type: "string part",
                pos: offset
            })
        }
        if (punctuation(here)) {
            i++
            tokens.push({
                text: text.slice(offset, i),
                type: "punctuation",
                pos: offset
            })
        }
        if (nameStart(here)) {
            for (i = offset + 1; text[i] && nameChar(text[i]); i++);
            tokens.push({
                text: text.slice(offset, i),
                type: "name",
                pos: offset
            })
        }
        offset = i
    }
    return tokens
}
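// The full lexer in combinator form: skip leading spaces, then repeatedly
// read one token (number | string | punctuation | name), each followed by
// trailing spaces.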
var tokens = spaces._and(
    int.or(string).or(punct).or(name).and_(spaces).many()
)
var text = require('fs').readFileSync('RECENT-1M.json').toString()

// "fast" dense loop first
var before = Date.now()
var toks = splitToTokens(text)
var d = Date.now() - before
console.log({bps: text.length / d * 1000, count: toks.length})  // bytes per second, token count

// "slow" monadic tokenizer second
var before = Date.now()
var {value} = tokens.runTokenizer({text, offset: 0})
var d = Date.now() - before
console.log({bps: text.length / d * 1000, count: value.length})
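// A quick sanity check on a small hypothetical sample (RECENT-1M.json is the
// benchmark input and is not included with the gist): both tokenizers should
// produce the same token list for the same text.
var sample = 'let pi = 3.14 in f (pi, "two words")'
console.log(splitToTokens(sample))
console.log(tokens.runTokenizer({text: sample, offset: 0}).value)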