Skip to content

Instantly share code, notes, and snippets.

@lilpolymath
Last active April 16, 2021 17:43
Show Gist options
  • Save lilpolymath/62f343f040117fe4fd65771d98f9d577 to your computer and use it in GitHub Desktop.
Save lilpolymath/62f343f040117fe4fd65771d98f9d577 to your computer and use it in GitHub Desktop.
/**
 * Splits Python source text into a flat list of lexical tokens.
 *
 * The scanner walks the input once, left to right, and emits one token per
 * lexeme. Every token is a plain object `{ type, token }` where `token` is the
 * exact source text and `type` is one of:
 *
 *   LPAREN, RPAREN, NUMBER, FLOAT, IMAGINARY, STRING, KEYWORD,
 *   'LOGICAL OPERATOR', MODULE, FUNCTION, IDENTIFIER, OPERATOR,
 *   NEWLINE, WHITESPACE
 *
 * Notable rules:
 *   - A name directly followed by `.` is tagged MODULE, a name directly
 *     followed by `(` is tagged FUNCTION, any other non-reserved name is an
 *     IDENTIFIER.
 *   - `and`, `in`, `is`, `not`, `or` are tagged 'LOGICAL OPERATOR'; every
 *     other reserved word is tagged KEYWORD.
 *   - Operators and delimiters are emitted one character at a time as
 *     OPERATOR (so `**` yields two tokens).
 *   - A `#` comment is dropped; the line terminator that ends it is emitted
 *     as a NEWLINE token.
 *   - `\n` and `\r` each produce their own NEWLINE token; every other
 *     whitespace character produces its own WHITESPACE token.
 *
 * @param {string} code - Python source text to scan.
 * @returns {Array<{type: string, token: string}>} Tokens in source order.
 * @throws {TypeError} When a character does not start any known token.
 */
const tokenizer = code => {
  const KEYWORDS = new Set([
    'False', 'None', 'True', 'and', 'as', 'assert', 'async', 'await',
    'break', 'class', 'continue', 'def', 'del', 'elif', 'else', 'except',
    'finally', 'for', 'from', 'global', 'if', 'import', 'in', 'is',
    'lambda', 'nonlocal', 'not', 'or', 'pass', 'raise', 'return', 'try',
    'while', 'with', 'yield',
  ]);
  // Subset of the reserved words that gets its own token type. It must be
  // checked before KEYWORDS, otherwise it can never match.
  const LOGIC = new Set(['and', 'in', 'is', 'not', 'or']);
  const IDENTIFIER_START = /[A-Za-z_]/;
  const IDENTIFIER_PART = /[A-Za-z0-9_]/;
  const DIGIT = /[0-9]/;
  const QUOTES = /["'’]/;
  // Explicit list of single-character operators and delimiters. `?` and the
  // backtick are not Python punctuation but were accepted by earlier versions
  // of this scanner, so they are kept to avoid new failures.
  const OPERATORS = /[-+*\/%@&|^~<>=!.,:;?`\\\[\]{}]/;
  const WHITESPACE = /\s/;

  const tokens = [];
  let cursor = 0;

  // Every inner loop below checks `cursor < code.length` explicitly:
  // RegExp#test(undefined) tests the string "undefined", which would make a
  // letter-matching pattern succeed forever once the cursor passes the end.
  while (cursor < code.length) {
    const char = code[cursor];

    if (char === '(') {
      tokens.push({ type: 'LPAREN', token: char });
      cursor++;
      continue;
    }

    if (char === ')') {
      tokens.push({ type: 'RPAREN', token: char });
      cursor++;
      continue;
    }

    if (DIGIT.test(char)) {
      // digits [ '.' digits ] [ 'j' ]
      const start = cursor;
      let type = 'NUMBER';
      while (cursor < code.length && DIGIT.test(code[cursor])) {
        cursor++;
      }
      if (code[cursor] === '.') {
        type = 'FLOAT';
        cursor++;
        while (cursor < code.length && DIGIT.test(code[cursor])) {
          cursor++;
        }
      }
      // The imaginary suffix wins over FLOAT, so `2.6j` is IMAGINARY.
      if (code[cursor] === 'j') {
        type = 'IMAGINARY';
        cursor++;
      }
      tokens.push({ type, token: code.slice(start, cursor) });
      continue;
    }

    if (QUOTES.test(char)) {
      const start = cursor;
      cursor++; // step over the opening quote
      while (cursor < code.length && code[cursor] !== char) {
        // A backslash escapes the next character, so `\'` does not close
        // the string.
        cursor += code[cursor] === '\\' ? 2 : 1;
      }
      // Include the closing quote when there is one; an unterminated string
      // simply runs to the end of the input.
      cursor = Math.min(cursor + 1, code.length);
      tokens.push({ type: 'STRING', token: code.slice(start, cursor) });
      continue;
    }

    if (IDENTIFIER_START.test(char)) {
      const start = cursor;
      while (cursor < code.length && IDENTIFIER_PART.test(code[cursor])) {
        cursor++;
      }
      const word = code.slice(start, cursor);
      const following = code[cursor]; // undefined at end of input

      let type = 'IDENTIFIER';
      if (LOGIC.has(word)) {
        type = 'LOGICAL OPERATOR';
      } else if (KEYWORDS.has(word)) {
        type = 'KEYWORD';
      } else if (following === '.') {
        type = 'MODULE';
      } else if (following === '(') {
        type = 'FUNCTION';
      }
      tokens.push({ type, token: word });
      continue;
    }

    if (OPERATORS.test(char)) {
      tokens.push({ type: 'OPERATOR', token: char });
      cursor++;
      continue;
    }

    if (char === '\n' || char === '\r') {
      tokens.push({ type: 'NEWLINE', token: char });
      cursor++;
      continue;
    }

    if (char === '#') {
      // Skip the comment body up to (not including) the line terminator.
      while (
        cursor < code.length &&
        code[cursor] !== '\n' &&
        code[cursor] !== '\r'
      ) {
        cursor++;
      }
      // A comment on the last line may have no terminator to report.
      if (cursor < code.length) {
        tokens.push({ type: 'NEWLINE', token: code[cursor] });
        cursor++;
      }
      continue;
    }

    if (WHITESPACE.test(char)) {
      tokens.push({ type: 'WHITESPACE', token: char });
      cursor++;
      continue;
    }

    throw new TypeError('I dont know what this character is: ' + char);
  }

  return tokens;
};
// Command-line entry point: tokenize the file named by the first argument
// and print the resulting token list.

// Make sure we got a filename on the command line.
if (process.argv.length < 3) {
  // Usage goes to stderr because this path exits with a failure status.
  console.error(`Usage: node ${process.argv[1]} FILENAME`);
  process.exit(1);
}

// Read the file and print the result of tokenization.
const fs = require('fs');
const util = require('util');
const filename = process.argv[2];

fs.readFile(filename, 'utf8', (err, data) => {
  if (err) throw err;
  // console.log formats arrays with util.inspect's default maxArrayLength of
  // 100, which would cut the token list short for any non-trivial file, so
  // the list is formatted explicitly with the limit lifted.
  const rendered = util.inspect(tokenizer(data), { maxArrayLength: null });
  console.log('TOKENS:', rendered);
});
@lilpolymath
Copy link
Author

Takes Python code like this:

import math

def gm(x,y):
    #something here sha
    some = 3
    print(2.6 + 4j)
    print('to know you.')
    return math.sqrt(x * y)

This is the output:

[
  { type: 'NEWLINE', token: '\n' },
  { type: 'WHITESPACE', token: ' ' },
  { type: 'WHITESPACE', token: ' ' },
  { type: 'KEYWORD', token: 'def' },
  { type: 'FUNCTION', token: 'gm' },
  { type: 'IDENTIFIER', token: 'x' },
  { type: 'IDENTIFIER', token: 'y' },
  { type: 'OPERATOR', token: ':' },
  { type: 'NEWLINE', token: '\n' },
  { type: 'WHITESPACE', token: ' ' },
  { type: 'WHITESPACE', token: ' ' },
  { type: 'WHITESPACE', token: ' ' },
  { type: 'WHITESPACE', token: ' ' },
  { type: 'WHITESPACE', token: ' ' },
  { type: 'WHITESPACE', token: ' ' },
  { type: 'NEWLINE', token: '\n' },
  { type: 'WHITESPACE', token: ' ' },
  { type: 'WHITESPACE', token: ' ' },
  { type: 'WHITESPACE', token: ' ' },
  { type: 'WHITESPACE', token: ' ' },
  { type: 'WHITESPACE', token: ' ' },
  { type: 'WHITESPACE', token: ' ' },
  { type: 'IDENTIFIER', token: 'some' },
  { type: 'OPERATOR', token: '=' },
  { type: 'WHITESPACE', token: ' ' },
  { type: 'NUMBER', token: '3' },
  { type: 'NEWLINE', token: '\n' },
  { type: 'WHITESPACE', token: ' ' },
  { type: 'WHITESPACE', token: ' ' },
  { type: 'WHITESPACE', token: ' ' },
  { type: 'WHITESPACE', token: ' ' },
  { type: 'WHITESPACE', token: ' ' },
  { type: 'WHITESPACE', token: ' ' },
  { type: 'FUNCTION', token: 'print' },
  { type: 'FLOAT', token: '2.6' },
  { type: 'WHITESPACE', token: ' ' },
  { type: 'OPERATOR', token: '+' },
  { type: 'WHITESPACE', token: ' ' },
  { type: 'IMAGINARY', token: '4j' },
  { type: 'RPAREN', token: ')' },
  { type: 'NEWLINE', token: '\n' },
  { type: 'WHITESPACE', token: ' ' },
  { type: 'WHITESPACE', token: ' ' },
  { type: 'WHITESPACE', token: ' ' },
  { type: 'WHITESPACE', token: ' ' },
  { type: 'WHITESPACE', token: ' ' },
  { type: 'WHITESPACE', token: ' ' },
  { type: 'FUNCTION', token: 'print' },
  { type: 'STRING', token: "'to know you.'" },
  { type: 'RPAREN', token: ')' },
  { type: 'NEWLINE', token: '\n' },
  { type: 'WHITESPACE', token: ' ' },
  { type: 'WHITESPACE', token: ' ' },
  { type: 'WHITESPACE', token: ' ' },
  { type: 'WHITESPACE', token: ' ' },
  { type: 'WHITESPACE', token: ' ' },
  { type: 'WHITESPACE', token: ' ' },
  { type: 'KEYWORD', token: 'return' },
  { type: 'FUNCTION', token: 'math' },
  { type: 'FUNCTION', token: 'sqrt' },
  { type: 'IDENTIFIER', token: 'x' },
  { type: 'OPERATOR', token: '*' },
  { type: 'WHITESPACE', token: ' ' },
  { type: 'IDENTIFIER', token: 'y' },
  { type: 'NEWLINE', token: '\n' },
  { type: 'WHITESPACE', token: ' ' },
  { type: 'WHITESPACE', token: ' ' }
]

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment