// Save HamedFathi/a6598422ac2755b3221a9604c2c67432 to your computer and use it in GitHub Desktop.
// Small, simple, JS tokenizer in JS.
// This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below.
// To review, open the file in an editor that reveals hidden Unicode characters.
// Learn more about bidirectional Unicode characters
/**
 * Simple JavaScript tokenizer (not a full parser!!!)
 *
 * Portions taken from Narcissus by Brendan Eich <[email protected]>.
 */
/*jshint evil: true, regexdash: false, regexp: false */
/**
 * Words that the tokenizer reports as 'KEYWORD' tokens instead of
 * 'IDENTIFIER' tokens. Besides statement keywords the list also contains the
 * literals `true`, `false` and `null`.
 */
var KEYWORDS = [
    'break',
    'case', 'catch', 'const', 'continue',
    'debugger', 'default', 'delete', 'do',
    'else', 'enum',
    'false', 'finally', 'for', 'function',
    'if', 'in', 'instanceof',
    'new', 'null',
    'return',
    'switch',
    'this', 'throw', 'true', 'try', 'typeof',
    'var', 'void',
    'while', 'with'
];
/**
 * Maps every operator / punctuator spelling to its token type name.
 *
 * Names that start with `ASSIGN` are assignment operators; the part after
 * `ASSIGN_` names the compound operation.
 *
 * The entries are ordered so that an operator always appears BEFORE any
 * shorter operator that is a prefix of it (e.g. '>>>=' before '>>=' before
 * '>>' before '>'). Code that builds a first-match alternation from the keys
 * in declaration order relies on this, so keep it when adding entries.
 */
var OPERATORS = {
    '>>>=': 'ASSIGN_URSH',
    '>>=': 'ASSIGN_RSH',
    '<<=': 'ASSIGN_LSH',
    '|=': 'ASSIGN_BITWISE_OR',
    '^=': 'ASSIGN_BITWISE_XOR',
    '&=': 'ASSIGN_BITWISE_AND',
    '+=': 'ASSIGN_PLUS',
    '-=': 'ASSIGN_MINUS',
    '*=': 'ASSIGN_MUL',
    '/=': 'ASSIGN_DIV',
    '%=': 'ASSIGN_MOD',
    ';': 'SEMICOLON',
    ',': 'COMMA',
    '?': 'HOOK',
    ':': 'COLON',
    '||': 'OR',
    '&&': 'AND',
    '|': 'BITWISE_OR',
    '^': 'BITWISE_XOR',
    '&': 'BITWISE_AND',
    '===': 'STRICT_EQ',
    '==': 'EQ',
    '=': 'ASSIGN',
    '!==': 'STRICT_NE',
    '!=': 'NE',
    '<<': 'LSH',
    '<=': 'LE',
    '<': 'LT',
    '>>>': 'URSH',
    '>>': 'RSH',
    '>=': 'GE',
    '>': 'GT',
    '++': 'INCREMENT',
    '--': 'DECREMENT',
    '+': 'PLUS',
    '-': 'MINUS',
    '*': 'MUL',
    '/': 'DIV',
    '%': 'MOD',
    '!': 'NOT',
    '~': 'BITWISE_NOT',
    '.': 'DOT',
    '[': 'LEFT_BRACKET',
    ']': 'RIGHT_BRACKET',
    '{': 'LEFT_CURLY',
    '}': 'RIGHT_CURLY',
    '(': 'LEFT_PAREN',
    ')': 'RIGHT_PAREN'
};
// Regular Expressions --------------------------------------------------------
// ----------------------------------------------------------------------------

// Source of the operator pattern: one `^`-anchored alternative per operator.
// The operators are sorted longest-first so that the first alternative that
// matches is always the longest operator, without depending on the key order
// of OPERATORS. Regexp meta characters in the operators are escaped.
var opMatch = Object.keys(OPERATORS)
    .sort(function(a, b) {
        return b.length - a.length;
    })
    .map(function(op) {
        return '^' + op.replace(/[?|^&(){}\[\]+\-*\/\.]/g, '\\$&');
    })
    .join('|');

// Every pattern below is anchored with `^` and must NOT carry the `m` flag:
// the tokenizer applies them to "the rest of the input", and with `m` the
// anchor would also match at the start of any later line.
var opRegExp = new RegExp(opMatch),

    // 1.  1.5  1.5e3  1e3  .5  .5e-3
    fpRegExp = /^\d+\.\d*(?:[eE][-+]?\d+)?|^\d+(?:\.\d*)?[eE][-+]?\d+|^\.\d+(?:[eE][-+]?\d+)?/,

    // /body/flags; group 1 is the body (escapes and [...] classes may contain
    // a slash), group 2 the flags.
    reRegExp = /^\/((?:\\.|\[(?:\\.|[^\]])*\]|[^\/])+)\/([dgimsuvy]*)/,

    // Hex literal or a run of digits. A leading zero does not end the match,
    // so `09` stays a single token.
    intRegExp = /^0[xX][\da-fA-F]+|^\d+/,

    // /* ... */ up to the first closing marker, across line breaks.
    multiCommentRegExp = /^\/\*[\s\S]*?\*\//,

    // `//` up to the end of the line.
    commentRegExp = /^\/\/.*/,

    identRegExp = /^[$_\w]+/,

    // Spaces and tabs only; line breaks are tokens of their own.
    wsRegExp = /^[\ \t]+/,

    // Single or double quoted string with backslash escapes.
    strRegExp = /^'([^'\\]|\\.)*'|^"([^"\\]|\\.)*"/;
// Token Class ----------------------------------------------------------------
// ----------------------------------------------------------------------------

/**
 * A single token. A fresh token has position 1:1, all whitespace counters at
 * zero and `type`, `value` and `plain` set to null; whoever creates the token
 * fills those fields in.
 *
 * Fields:
 *   col, line  - position of the token
 *   ws         - whitespace counters: `indent`, `before`, `after`, `trailing`
 *   type       - token type name
 *   value      - token value
 *   plain      - token text
 */
function Token() {

    this.col = 1;
    this.line = 1;

    this.ws = {
        indent: 0,
        before: 0,
        after: 0,
        trailing: 0
    };

    this.type = null;
    this.value = null;
    this.plain = null;

}

/**
 * Debug representation:
 *
 *   [TYPE......... [indent:before]line:col[after:trailing]: value]
 *
 * The type is padded (or cut) to exactly 13 characters so that a printed
 * token list lines up in columns.
 *
 * @returns {string}
 */
Token.prototype.toString = function() {

    // 13 spaces; appended before cutting so short type names get padded.
    var padding = new Array(13 + 1).join(' '),
        typeColumn = (this.type + padding).slice(0, 13);

    return '[' + typeColumn + ' '
        + '[' + this.ws.indent + ':' + this.ws.before + ']' + this.line
        + ':' + this.col + '[' + this.ws.after + ':' + this.ws.trailing
        + ']' + ': ' + this.value + ']';

};
// Main Tokenizer function ----------------------------------------------------
// ----------------------------------------------------------------------------

/**
 * Splits JavaScript source text into tokens.
 *
 * Whitespace (spaces and tabs) produces no tokens; it is recorded in the `ws`
 * counters of the neighbouring tokens instead. Line breaks (`\n`) are emitted
 * as 'NEWLINE' tokens. Lines and columns are 1-based.
 *
 * Token types produced: NEWLINE, MULTI_COMMENT, COMMENT, FLOAT, INTEGER,
 * STRING, KEYWORD, IDENTIFIER, REGEXP, ASSIGN (with an extra `operator`
 * field: null for `=`, otherwise e.g. 'PLUS' for `+=`) and the type names
 * listed in OPERATORS. REGEXP tokens carry an extra `flags` field.
 *
 * @param {string} input      Source text to tokenize.
 * @param {number} [tabWidth] Number of columns a tab counts for in the
 *                            whitespace counters (default 4).
 * @returns {{peek: Function, next: Function, get: Function, all: Function,
 *            at: Function, reset: Function}} Cursor over the token list:
 *   peek()  - token after the current one, without moving
 *   next()  - moves to the next token and returns it
 *   get()   - current token
 *   all()   - the whole token array
 *   at(pos) - token at index `pos`
 *   reset() - rewinds so that the following next() returns the first token
 * @throws {Error} When a character cannot start any token.
 */
function tokenize(input, tabWidth) {

    var cursor = 0,
        line = 1,
        // Columns are 1-based on every line, including the first one.
        col = 1,
        spaceBefore = 0,
        indentation = 0,
        // A string of `tabWidth` spaces (joining n + 1 empty slots gives n).
        tabExpand = new Array((tabWidth || 4) + 1).join(' '),
        token = new Token(),
        lastToken = null,
        list = [],
        // After one of these tokens a `/` is a division, not a regexp start.
        divisionContext = {
            IDENTIFIER: true,
            INTEGER: true,
            FLOAT: true,
            STRING: true,
            REGEXP: true,
            RIGHT_PAREN: true,
            RIGHT_BRACKET: true
        },
        sub, eol, subline, m, lines, radix, op, len;

    // Grab the inputs
    while (cursor < input.length) {

        // Save the last non-whitespace token
        if (token.type !== 'WHITESPACE') {
            lastToken = token;
        }

        // Get the rest of the input, and the rest of the current line for the
        // patterns that must not cross a line break (comments, regexps).
        // Without a remaining line break the rest of the input IS the line.
        sub = input.substring(cursor);
        eol = sub.indexOf('\n');
        subline = eol === -1 ? sub : sub.substring(0, eol);
        m = null;

        // Create next token
        token = new Token();
        token.line = line;
        token.col = col;
        token.ws.indent = indentation;
        token.ws.before = lastToken.type === 'NEWLINE' ? 0 : spaceBefore;

        // Reset whitespace
        spaceBefore = 0;

        // Newlines
        if (sub[0] === '\n') {

            lastToken.ws.trailing = token.ws.before;
            token.ws.before = 0;
            token.type = 'NEWLINE';
            token.value = '\\n';
            token.plain = sub[0];

            // The advance at the bottom of the loop turns this into column 1.
            col = 0;
            line++;

            // The next line starts without indentation until proven otherwise.
            indentation = 0;

        // Multi line comments; only accepted directly at the cursor
        } else if ((m = sub.match(multiCommentRegExp)) && m.index === 0) {

            token.type = 'MULTI_COMMENT';
            token.plain = m[0];
            token.value = m[0].slice(2, -2);

            lines = token.plain.split('\n');
            if (lines.length > 1) {
                line += lines.length - 1;

                // Pre-compensate the advance below so that the column ends up
                // right behind the last line of the comment.
                col = lines[lines.length - 1].length - m[0].length + 1;
            }

        // Comment
        } else if ((m = subline.match(commentRegExp))) {
            token.type = 'COMMENT';
            token.plain = m[0];
            token.value = m[0].substr(2);

        // Float
        } else if ((m = sub.match(fpRegExp))) {
            token.type = 'FLOAT';
            token.plain = m[0];
            token.value = parseFloat(m[0]);

        // Integer
        } else if ((m = sub.match(intRegExp))) {
            token.type = 'INTEGER';
            token.plain = m[0];

            // Explicit radix: hex, legacy octal (leading zero followed by
            // octal digits only) or decimal.
            radix = 10;
            if (/^0[xX]/.test(m[0])) {
                radix = 16;

            } else if (/^0[0-7]+$/.test(m[0])) {
                radix = 8;
            }
            token.value = parseInt(m[0], radix);

        // String
        } else if ((m = sub.match(strRegExp))) {
            token.type = 'STRING';
            token.plain = m[0];

            // SECURITY NOTE: eval() is used to decode the escape sequences.
            // The evaluated text is limited to what strRegExp matched (one
            // quoted string literal), but this is still eval on input text;
            // replace it with a dedicated unescape routine before feeding
            // untrusted sources. NOTE(review): in strict-mode code a literal
            // with a legacy octal escape presumably makes this throw.
            token.value = eval(m[0]);

        // Identifier
        } else if ((m = sub.match(identRegExp))) {
            token.plain = m[0];
            token.value = m[0];
            token.type = KEYWORDS.indexOf(token.value) !== -1 ? 'KEYWORD' : 'IDENTIFIER';

        // Regexp, matches only on the same line and only where a value cannot
        // have ended right before it (otherwise the slash is a division)
        } else if (divisionContext[lastToken.type] !== true && (m = subline.match(reRegExp))) {
            token.type = 'REGEXP';
            token.plain = m[0];
            token.value = m[1];
            token.flags = m[2];

        // Operator
        } else if ((m = sub.match(opRegExp))) {

            // Check for assignments
            op = OPERATORS[m[0]];
            if (op.substring(0, 6) === 'ASSIGN') {

                token.type = 'ASSIGN';
                if (op === 'ASSIGN') {
                    token.operator = null;

                } else {
                    // Strip the leading 'ASSIGN_'
                    token.operator = op.substring(7);
                }

            } else {
                token.type = op;
            }

            token.plain = m[0];
            token.value = m[0];

        // Whitespace handling
        } else if ((m = sub.match(wsRegExp))) {

            token.type = 'WHITESPACE';

            // Provide meta information about whitespacing
            spaceBefore = m[0].replace(/\t/g, tabExpand).length;
            if (col === 1) {
                indentation = spaceBefore;

            } else {
                lastToken.ws.after = spaceBefore;
            }

        // Nothing can start with this character
        } else {
            throw new Error('Unexpected: ' + sub[0] + ' on ' + line + ':' + col);
        }

        // Add non-whitespace tokens to stream
        if (token.type !== 'WHITESPACE') {
            list.push(token);
        }

        // Advance cursor by match length (a newline has no match object)
        len = m ? m[0].length : 1;
        cursor += len;
        col += len;

    }

    // Cursor API over the token list; -1 means "before the first token"
    var tokenPos = -1;
    return {

        peek: function() {
            return list[tokenPos + 1];
        },

        next: function() {
            return list[++tokenPos];
        },

        get: function() {
            return list[tokenPos];
        },

        all: function() {
            return list;
        },

        at: function(pos) {
            return list[pos];
        },

        reset: function() {
            // Same state as right after tokenizing.
            tokenPos = -1;
        }

    };

}
// The tokenizer function is the module's only export.
module.exports = tokenize;
// Sign up for free to join this conversation on GitHub.
// Already have an account? Sign in to comment