Skip to content

Instantly share code, notes, and snippets.

@Heimdell
Created April 25, 2017 10:46
Show Gist options
  • Save Heimdell/26652fd41cf588753306be84ec40b170 to your computer and use it in GitHub Desktop.
// set :: string -> (char -> bool)
// Builds a membership predicate from a string (iterated per character)
// or an array of strings. Uses a Set rather than a plain object so
// inherited Object.prototype keys (e.g. "constructor", "toString")
// can never be false positives, and lookup stays O(1).
var set = (items) => {
var members = new Set(items)
return (x) => members.has(x)
}
// Reserved words of the language being tokenized.
var reserved = set("module object end => let in \\ open ---".split(" "))
var notReserved = (w) => !reserved(w)
// Character classes used by the tokenizers below.
var space = set(" \n\t")
var punctuation = set("{[(;,.:)]}")
var digit = set("0123456789")
var quote = set("'\"")
// Hoisted once: the original versions rebuilt these sets on EVERY call,
// allocating a fresh table per character in the tokenizer's hot loop.
var nonNameChar = set(" \n\t{[(;,.:)]}'\"")
var nonNameStart = set(" \n\t{[(;,.:)]}'\"0123456789")
// A name continues with anything that is not whitespace/punctuation/quote;
// it may not START with a digit either.
var nameChar = (c) => c && !nonNameChar(c)
var nameStart = (c) => c && !nonNameStart(c)
// Minimal parser-combinator monad over streams of {text, offset}.
// A run returns {value, stream} on success; a bare {} signals failure
// (the combinators test `.value` for truthiness).
class Tokenizer {
constructor(runTokenizer) {
this.runTokenizer = runTokenizer
}
// Lift a pure value into a tokenizer that consumes no input.
static of(value) {
return new Tokenizer((stream) => ({ value, stream }))
}
// Monadic bind: run `this`, then feed its value to `rest`.
then(rest) {
return new Tokenizer((stream) => {
var first = this.runTokenizer(stream)
if (!first.value) return first
return rest(first.value).runTokenizer(first.stream)
})
}
// Ordered alternative: try `this`; on failure, retry `other` on the
// ORIGINAL stream (no input is consumed by a failed attempt).
or(other) {
return new Tokenizer((stream) => {
var attempt = this.runTokenizer(stream)
return attempt.value ? attempt : other.runTokenizer(stream)
})
}
// Sequence, keeping the LEFT value.
and_(other) {
return this.then((kept) => other.map(() => kept))
}
// Sequence, keeping the RIGHT value.
_and(other) {
return this.then(() => other)
}
// Transform the produced value.
map(f) {
return this.then((x) => Tokenizer.of(f(x)))
}
// Zero-or-more repetitions; always succeeds with an array of values.
many() {
return new Tokenizer((stream) => {
var collected = []
for (;;) {
var step = this.runTokenizer(stream)
if (!step.value) break
collected.push(step.value)
stream = step.stream
}
return { value: collected, stream }
})
}
}
// Number token: one or more digits, optionally followed by "." and
// more digits (a trailing "12." is accepted, matching splitToTokens).
var int = new Tokenizer(({ text, offset }) => {
var end = offset
while (text[end] && digit(text[end])) end++
if (end === offset) return {}
if (text[end] === '.') {
end++
while (text[end] && digit(text[end])) end++
}
return {
value: { text: text.slice(offset, end), type: "number", pos: offset },
stream: { text, offset: end }
}
})
// String-part token: a quote (either ' or "), everything up to the
// matching quote, and the closing quote itself. An unterminated string
// consumes to the end of input.
var string = new Tokenizer(({ text, offset }) => {
var delim = text[offset]
if (delim !== "'" && delim !== '"') return {}
var end = offset + 1
while (text[end] && text[end] !== delim) end++
end++
return {
value: { text: text.slice(offset, end), type: "string part", pos: offset },
stream: { text, offset: end }
}
})
// Whitespace skipper: always succeeds (possibly consuming nothing) and
// yields a dummy token with empty text so `.value` stays truthy.
var spaces = new Tokenizer(({ text, offset }) => {
var end = offset
while (text[end] && space(text[end])) end++
return {
value: { text: "", type: "space", pos: offset },
stream: { text, offset: end }
}
})
// Single punctuation-character token.
// (The original tested punctuation(text[offset]) twice, leaving a dead
// `: {}` branch; one guard is sufficient.)
var punct = new Tokenizer(({ text, offset }) => {
if (!punctuation(text[offset]))
return {}
return {
value: { text: text[offset], type: "punctuation", pos: offset },
stream: { text, offset: offset + 1 }
}
})
// Name token: a nameStart character followed by any run of nameChars.
var name = new Tokenizer(({ text, offset }) => {
var head = text[offset]
if (!head || !nameStart(head)) return {}
var end = offset + 1
while (text[end] && nameChar(text[end])) end++
return {
value: { text: text.slice(offset, end), type: "name", pos: offset },
stream: { text, offset: end }
}
})
// Hand-rolled single-pass tokenizer (the "dense" benchmark baseline).
// Produces the same {text, type, pos} tokens as the combinator version.
var splitToTokens = (text) => {
var offset = 0
var tokens = []
while (offset < text.length) {
// skip leading whitespace
for (; space(text[offset]); offset++);
var i = offset
var here = text[offset]
if (digit(here)) {
// integer part, then an optional ".fraction"
for (i = offset; text[i] && digit(text[i]); i++);
if (text[i] == '.') {
i++
for (; text[i] && digit(text[i]); i++);
}
tokens.push({
text: text.slice(offset, i),
type: "number",
pos: offset
})
}
if (quote(here)) {
// scan to the matching quote, inclusive
for (i = offset + 1; text[i] && text[i] != here; i++);
i++;
tokens.push({
text: text.slice(offset, i),
type: "string part",
pos: offset
})
}
if (punctuation(here)) {
i++
// BUGFIX: was mislabeled "string part" (copy-paste); the monadic
// `punct` tokenizer calls these "punctuation".
tokens.push({
text: text.slice(offset, i),
type: "punctuation",
pos: offset
})
}
if (nameStart(here)) {
for (i = offset + 1; text[i] && nameChar(text[i]); i++);
tokens.push({
text: text.slice(offset, i),
type: "name",
pos: offset
})
}
offset = i
}
return tokens
}
// Whole-input tokenizer: skip leading whitespace, then repeat
// (any token, trailing whitespace) and collect the tokens.
var anyToken = [int, string, punct, name].reduce((a, b) => a.or(b))
var tokens = spaces._and(anyToken.and_(spaces).many())
// Benchmark harness: tokenize the same file with both implementations
// and report bytes/second plus token count for each.
var text = require('fs').readFileSync('RECENT-1M.json').toString()
// Time a thunk; the original redeclared `before`/`d` twice and could
// divide by zero on a sub-millisecond run.
var bench = (run) => {
var before = Date.now()
var result = run()
var elapsed = Math.max(Date.now() - before, 1) // avoid Infinity bps
console.log({ bps: text.length / elapsed * 1000, count: result.length })
}
// "fast" dense cycle first
bench(() => splitToTokens(text))
// "slow" monad second
bench(() => tokens.runTokenizer({ text, offset: 0 }).value)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment