Forked from borgar/Tiny JavaScript tokenizer.js
Last active
September 16, 2019 20:51
-
-
Save klappy/7eb31af0c772031636b57e27fc9ad5e6 to your computer and use it in GitHub Desktop.
A compact tokenizer written in JavaScript.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Tiny tokenizer - https://gist.github.com/borgar/451393
 *
 * Splits a string into a flat sequence of classified tokens. On every step
 * each parser is run against the remaining text and the match that starts
 * closest to the current position wins; when several parsers match at the
 * same position, the one declared first in `parsers` wins. Text that lies
 * before the winning match (or all remaining text when nothing matches) is
 * emitted as a token of the default type.
 *
 * @param {String} string - string to be tokenized; any falsy value is treated as ''
 * @param {Object} parsers - map of token type to RegExp, e.g.
 *   { word:/\w+/, whitespace:/\s+/, punctuation:/[^\w\s]/ }
 * @param {String} deftok - type to label tokens that are not classified with the above parsers
 *   (falls back to 'unknown' when omitted or falsy)
 * @return {Array} - array of objects =>
 *   [{ token:"this", type:"word", matches:[] }, { token:" ", type:"whitespace", matches:[] }, ... ]
 *   `matches` holds the capture groups of the winning regex; default-type tokens
 *   carry only `token` and `type`.
 * @throws {Error} when `string` is truthy but not a String
 **/
export const classifyTokens = (string, parsers, deftok) => {
  if (string && typeof string !== 'string') {
    throw new Error(`tokenizer.tokenize() string is not String: ${string}`);
  }

  // Work on a local copy instead of reassigning the parameter.
  let remaining = string || '';
  const defaultType = deftok || 'unknown';
  // Own enumerable entries only, collected once; a missing parsers map means "no parsers".
  const parserEntries = Object.entries(parsers || {});
  const tokens = [];

  while (remaining) {
    let best = null;
    let start = remaining.length;

    for (const [type, parser] of parserEntries) {
      // Global/sticky regexes resume from lastIndex, which is stale here
      // because `remaining` is a fresh slice on every pass.
      if (parser.global || parser.sticky) {
        parser.lastIndex = 0;
      }
      const match = parser.exec(remaining);
      // Zero-length matches are ignored: they consume nothing, so accepting
      // one at index 0 would keep the loop from ever advancing.
      // Strict `<` keeps the first-declared parser on ties.
      if (match && match[0].length > 0 && match.index < start) {
        best = {
          token: match[0],
          type,
          matches: match.slice(1),
        };
        start = match.index;
      }
    }

    if (start > 0) {
      // Text between the previous token and the current match (or the whole
      // remainder when nothing matched) goes out as the default type.
      tokens.push({
        token: remaining.slice(0, start),
        type: defaultType,
      });
    }
    if (best) {
      tokens.push(best);
    }

    remaining = remaining.slice(start + (best ? best.token.length : 0));
  }

  return tokens;
};
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment