Skip to content

Instantly share code, notes, and snippets.

@HamedFathi
Forked from BonsaiDen/tokenize.js
Created July 10, 2019 08:20
Show Gist options
  • Save HamedFathi/a6598422ac2755b3221a9604c2c67432 to your computer and use it in GitHub Desktop.
Save HamedFathi/a6598422ac2755b3221a9604c2c67432 to your computer and use it in GitHub Desktop.
Small, simple, JS tokenizer in JS.
/**
* Simple JavaScript tokenizer (not a full parser!!!)
*
* Portions taken from Narcissus by Brendan Eich <[email protected]>.
*/
/*jshint evil: true, regexdash: false, regexp: false */
var KEYWORDS = [
'break',
'case', 'catch', 'const', 'continue',
'debugger', 'default', 'delete', 'do',
'else', 'enum',
'false', 'finally', 'for', 'function',
'if', 'in', 'instanceof',
'new', 'null',
'return',
'switch',
'this', 'throw', 'true', 'try', 'typeof',
'var', 'void',
'while', 'with'
];
var OPERATORS = {
'>>>=': 'ASSIGN_URSH',
'>>=': 'ASSIGN_RSH',
'<<=': 'ASSIGN_LSH',
'|=': 'ASSIGN_BITWISE_OR',
'^=': 'ASSIGN_BITWISE_XOR',
'&=': 'ASSIGN_BITWISE_AND',
'+=': 'ASSIGN_PLUS',
'-=': 'ASSIGN_MINUS',
'*=': 'ASSIGN_MUL',
'/=': 'ASSIGN_DIV',
'%=': 'ASSIGN_MOD',
';': 'SEMICOLON',
',': 'COMMA',
'?': 'HOOK',
':': 'COLON',
'||': 'OR',
'&&': 'AND',
'|': 'BITWISE_OR',
'^': 'BITWISE_XOR',
'&': 'BITWISE_AND',
'===': 'STRICT_EQ',
'==': 'EQ',
'=': 'ASSIGN',
'!==': 'STRICT_NE',
'!=': 'NE',
'<<': 'LSH',
'<=': 'LE',
'<': 'LT',
'>>>': 'URSH',
'>>': 'RSH',
'>=': 'GE',
'>': 'GT',
'++': 'INCREMENT',
'--': 'DECREMENT',
'+': 'PLUS',
'-': 'MINUS',
'*': 'MUL',
'/': 'DIV',
'%': 'MOD',
'!': 'NOT',
'~': 'BITWISE_NOT',
'.': 'DOT',
'[': 'LEFT_BRACKET',
']': 'RIGHT_BRACKET',
'{': 'LEFT_CURLY',
'}': 'RIGHT_CURLY',
'(': 'LEFT_PAREN',
')': 'RIGHT_PAREN'
};
// Regular Expressions --------------------------------------------------------
// ----------------------------------------------------------------------------
var opMatch = '^';
for (var i in OPERATORS) {
if (opMatch !== '^') {
opMatch += '|^';
}
opMatch += i.replace(/[?|^&(){}\[\]+\-*\/\.]/g, '\\$&');
}
var opRegExp = new RegExp(opMatch),
fpRegExp = /^\d+\.\d*(?:[eE][-+]?\d+)?|^\d+(?:\.\d*)?[eE][-+]?\d+|^\.\d+(?:[eE][-+]?\d+)?/,
reRegExp = /^\/((?:\\.|\[(?:\\.|[^\]])*\]|[^\/])+)\/([gimy]*)/,
intRegExp = /^0[xX][\da-fA-F]+|^0[0-7]*|^\d+/,
multiCommentRegExp = /^\/\*(.|[\r\n])*?\*\//m,
commentRegExp = /^\/\/.*/,
identRegExp = /^[$_\w]+/,
wsRegExp = /^[\ \t]+/,
strRegExp = /^'([^'\\]|\\.)*'|^"([^"\\]|\\.)*"/;
// Token Class ----------------------------------------------------------------
// ----------------------------------------------------------------------------
function Token() {
this.col = 1;
this.line = 1;
this.ws = {
indent: 0,
before: 0,
after: 0,
trailing: 0
};
this.type = null;
this.value = null;
this.plain = null;
}
Token.prototype.toString = function() {
return '[' + (this.type + ' ').substr(0, 13) + ' '
+ '[' + this.ws.indent + ':' + this.ws.before + ']' + this.line
+ ':' + this.col + '[' + this.ws.after + ':' + this.ws.trailing
+ ']' + ': ' + this.value + ']';
};
// Main Tokenizer function ----------------------------------------------------
// ----------------------------------------------------------------------------
function tokenize(input, tabWidth) {
var cursor = 0,
line = 1,
col = 0,
spaceBefore = 0,
indentation = 0,
tabExpand = new Array((tabWidth || 4)).join(' '),
token = new Token(),
lastToken = null,
list = [];
// Grab the inputs
while(cursor < input.length) {
// Save the last non-whitespace token
if (token.type !== 'WHITESPACE') {
lastToken = token;
}
// Get the rest
// We also grab the rest of the line here for regexps
var sub = input.substring(cursor),
subline = sub.substring(0, sub.indexOf('\n')),
m = null;
// Create next token
token = new Token();
token.line = line;
token.col = col;
token.ws.indent = indentation;
token.ws.before = lastToken.type === 'NEWLINE' ? 0 : spaceBefore;
// Reset whitespace
spaceBefore = 0;
// Newlines
if (sub[0] === '\n') {
lastToken.ws.trailing = token.ws.before;
token.ws.before = 0;
token.type = 'NEWLINE';
token.value = '\\n';
token.plain = sub[0];
col = 0;
line++;
// Multi line comments
// don't ask how this regexp works just pray that it will never fail
} else if ((m = sub.match(multiCommentRegExp ))) {
token.type = 'MULTI_COMMENT';
token.plain = m[0];
token.value = m[0].slice(2, -2);
var lines = token.plain.split('\n');
line += lines.length - 1;
col = lines[lines.length - 1].length - m[0].length + 1;
// Comment
} else if ((m = subline.match(commentRegExp))) {
token.type = 'COMMENT';
token.plain = m[0];
token.value = m[0].substr(2);
// Float
} else if ((m = sub.match(fpRegExp))) {
token.type = 'FLOAT';
token.plain = m[0];
token.value = parseFloat(m[0]);
// Integer
} else if ((m = sub.match(intRegExp))) {
token.type = 'INTEGER';
token.plain = m[0];
token.value = parseInt(m[0]);
// String
} else if ((m = sub.match(strRegExp))) {
token.type = 'STRING';
token.plain = m[0];
token.value = eval(m[0]); // simpelst way to get the actual js string value, don't beat me, taken from narcissus!
// Identifier
} else if ((m = sub.match(identRegExp))) {
token.value = m[0];
token.type = KEYWORDS.indexOf(token.value) !== -1 ? 'KEYWORD' : 'IDENTIFIER';
// Regexp, matches online on the same line and only if we didn't encounter a identifier right before it
} else if (lastToken.type !== 'IDENTIFIER' && (m = subline.match(reRegExp))) {
token.type = 'REGEXP';
token.plain = m[0];
token.value = m[1];
token.flags = m[2];
// Operator
} else if ((m = sub.match(opRegExp))) {
// Check for assignments
var op = OPERATORS[m[0]];
if (op.substring(0, 6) === 'ASSIGN') {
token.type = 'ASSIGN';
if (op === 'ASSIGN') {
token.operator = null;
} else {
token.operator = op.substring(7);
}
} else {
token.type = op;
}
token.value = m[0];
// Whitespace handling
} else if ((m = sub.match(wsRegExp))) {
token.type = 'WHITESPACE';
// Provide meta information about whitespacing
spaceBefore = m[0].replace(/\t/g, ' ').length;
if (col === 1) {
indentation = spaceBefore;
} else {
lastToken.ws.after = spaceBefore;
}
// If we ever hit this... we suck
} else {
throw new Error('Unexpected: ' + sub[0] + ' on :');
}
// Add non-whitespace tokens to stream
if (token.type !== 'WHITESPACE') {
list.push(token);
}
// Advance cursor by match length
var len = 1;
if (m) {
len = m[0].length + m.index;
}
cursor += len;
col += len;
}
// Return some API for ya
var tokenPos = -1;
return {
peek: function() {
return list[tokenPos + 1];
},
next: function() {
return list[++tokenPos];
},
get: function() {
return list[tokenPos];
},
all: function() {
return list;
},
at: function(pos) {
return list[pos];
},
reset: function() {
tokenPos = 0;
}
};
}
module.exports = tokenize;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment