Last active
December 20, 2015 08:39
-
-
Save zheplusplus/6102238 to your computer and use it in GitHub Desktop.
Automaton-based tokenizer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Example: build a tokenizer for a tiny C-like language and print the tokens
// of a sample program.

// Use the CommonJS module when `require` exists, otherwise the browser global.
var tknz;
if (typeof require === 'undefined') {
  tknz = window.tokenizer;
} else {
  tknz = require('./tokenizer');
}

var t = tknz.Tokenizer();

// Single-character tokens: [characters, token type].
[
  ['=+-*/%<>!.', 'operator'],
  ['(', 'op_paren'],
  [')', 'cl_paren'],
  ['{', 'op_brace'],
  ['}', 'cl_brace'],
  [';', 'semicolon']
].forEach(function (pair) {
  t.simpleSymbols(pair[0], pair[1]);
});

// Characters skipped between tokens.
t.ignore(' \t\r');

// integer: one or more digits.
t.loop(tknz.DIGITS);
t.accept('integer');

// identifier: a letter followed by any mix of letters and digits.
t.startWith(tknz.LETTERS);
t.loop(tknz.DIGITS + tknz.LETTERS);
t.accept('identifier');

// Literal tokens are registered last: `fixed` copies the states already
// reachable for these characters, so the rules above must exist first.
t.fixed('if');
t.fixed('for');
['<=', '>=', '==', '!='].forEach(function (image) {
  t.fixed(image, 'operator');
});

console.log(t.tokenize('for (i = 0; i < 10; i = i + 1) { if (i % 3 == 0) { console.log(i); } }'));
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-function(exports) { | |
function each(string, cb) { | |
for (var i = 0; i < string.length; ++i) { | |
cb(string[i]); | |
} | |
} | |
exports.DIGITS_EXC_0 = '123456789', | |
exports.DIGITS = exports.DIGITS_EXC_0 + '0', | |
exports.LOWERCASES = 'qwertyuiopasdfghjklzxcvbnm'; | |
exports.UPPERCASES = exports.LOWERCASES.toUpperCase(); | |
exports.LETTERS = exports.LOWERCASES + exports.UPPERCASES; | |
exports.Tokenizer = function() { | |
var entryState = {}; | |
var ignoreState = { | |
_ignore: true | |
}; | |
var stateTrace = []; | |
function started() { | |
return 0 < stateTrace.length; | |
} | |
function currentState() { | |
if (!started()) { | |
throw 'not started'; | |
} | |
return stateTrace[stateTrace.length - 1]; | |
} | |
return { | |
simpleSymbols: function(symbols, name) { | |
var symbolState = { | |
_type: name || 'symbol' | |
}; | |
each(symbols, function(ch) { | |
if (entryState[ch]) { | |
throw 'duplicate entry: ' + ch; | |
} | |
entryState[ch] = symbolState; | |
}); | |
return this; | |
}, | |
startWith: function(next) { | |
if (started()) { | |
throw 'already started'; | |
} | |
stateTrace.push({}); | |
each(next, function(ch) { | |
if (entryState[ch]) { | |
throw 'duplicate entry: ' + ch; | |
} | |
entryState[ch] = currentState(); | |
}); | |
return this; | |
}, | |
fixed: function(image, type) { | |
if (!image) { | |
throw 'empty fixed token'; | |
} | |
function cloneState(s) { | |
var clone = {}, key; | |
if (s) { | |
for (key in s) { | |
clone[key] = s[key]; | |
} | |
} | |
return clone; | |
} | |
var passState = entryState; | |
var lastChar = image[image.length - 1]; | |
type = type || image; | |
each(image.substr(0, image.length - 1), function(ch) { | |
var shadowState = cloneState(passState[ch]); | |
passState[ch] = shadowState; | |
passState = shadowState; | |
}); | |
var shadowState = cloneState(passState[lastChar]); | |
shadowState._type = type; | |
passState[lastChar] = shadowState; | |
return this; | |
}, | |
loop: function(next) { | |
if (!started()) { | |
this.startWith(next); | |
} | |
each(next, function(ch) { | |
currentState()[ch] = currentState(); | |
}); | |
return this; | |
}, | |
ignore: function(next) { | |
each(next, function(ch) { | |
entryState[ch] = ignoreState; | |
ignoreState[ch] = ignoreState; | |
}); | |
return this; | |
}, | |
accept: function(type) { | |
currentState()._type = type; | |
stateTrace = []; | |
return this; | |
}, | |
tokenize: function(input) { | |
var state = entryState; | |
var token = []; | |
var result = []; | |
var me = this; | |
function resetConsume(ch) { | |
token = []; | |
state = entryState; | |
nextChar(ch); | |
} | |
function nextChar(ch) { | |
if (state[ch]) { | |
state = state[ch]; | |
if (!state._ignore) { | |
token.push(ch); | |
} | |
return; | |
} | |
if (state._type) { | |
result.push({ | |
token: token.join(''), | |
type: state._type | |
}); | |
return resetConsume(ch); | |
} | |
if (state._ignore) { | |
return resetConsume(ch); | |
} | |
me.error('unexpected character'); | |
} | |
each(input, nextChar); | |
if (token.length !== 0 && state) { | |
result.push({ | |
token: token.join(''), | |
type: state._type | |
}); | |
} | |
return result; | |
}, | |
error: function(error) { | |
throw error; | |
} | |
}; | |
}; | |
}(typeof exports === 'undefined' ? window.tokenizer = {} : exports); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment