Last active
April 16, 2021 17:43
-
-
Save lilpolymath/62f343f040117fe4fd65771d98f9d577 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Splits Python source text into a flat list of lexical tokens.
 *
 * Every token is an object `{ type, token }` where `token` is the exact source
 * text and `type` is one of: 'LPAREN', 'RPAREN', 'NUMBER', 'FLOAT',
 * 'IMAGINARY', 'STRING', 'KEYWORD', 'LOGICAL OPERATOR', 'MODULE', 'FUNCTION',
 * 'IDENTIFIER', 'OPERATOR', 'NEWLINE' or 'WHITESPACE'.
 *
 * Notes on classification:
 * - Comments (`#` up to the end of the line) are dropped; the line break that
 *   follows them is still emitted as a NEWLINE token.
 * - A name directly followed by `.` is a MODULE, directly followed by `(` a
 *   FUNCTION; keywords and logical operators take precedence over both.
 * - All punctuation, including brackets, braces and delimiters such as `,`
 *   and `:`, is reported with type 'OPERATOR'. Multi-character operators
 *   (`**`, `//`, `!=`, `<=`, `+=`, ...) are emitted as a single token.
 * - `\r` and `\n` each produce their own NEWLINE token.
 *
 * @param {string} code - Python source text to tokenize.
 * @returns {Array<{type: string, token: string}>} Tokens in source order.
 * @throws {TypeError} If a character cannot start any known token.
 */
const tokenizer = code => {
  const tokens = [];
  let cursor = 0;

  const KEYWORDS = new Set([
    'False', 'None', 'True', 'and', 'as', 'assert', 'async', 'await', 'break',
    'class', 'continue', 'def', 'del', 'elif', 'else', 'except', 'finally',
    'for', 'from', 'global', 'if', 'import', 'in', 'is', 'lambda', 'nonlocal',
    'not', 'or', 'pass', 'raise', 'return', 'try', 'while', 'with', 'yield',
  ]);
  const LOGIC = new Set(['and', 'in', 'is', 'not', 'or']);
  // Straight double quote, straight single quote and the typographic
  // apostrophe (U+2019) all open a string literal.
  const QUOTES = new Set(['"', "'", '\u2019']);

  // Sticky (`y`) patterns only match at `lastIndex`, which lets us match
  // whole lexemes at the cursor instead of testing one character at a time.
  const NUMBER = /[0-9]+(?:\.[0-9]*)?j?/y;
  const NAME = /[A-Za-z_][A-Za-z0-9_]*/y;
  // Longest alternatives come first so `**=` wins over `**`, which wins over `*`.
  const PUNCTUATION = /\*\*=|\/\/=|>>=|<<=|->|:=|[=!<>+\-*\/%&|^@]=|\*\*|\/\/|<<|>>|[-+*\/%@&|^~<>=!.,:;[\]{}\\]/y;
  const WHITESPACE = /\s/;

  // Returns the text `pattern` matches exactly at `index`, or null.
  const matchAt = (pattern, index) => {
    pattern.lastIndex = index;
    const match = pattern.exec(code);
    return match === null ? null : match[0];
  };

  // Records a token and moves the cursor past its text.
  const emit = (type, token) => {
    tokens.push({ type, token });
    cursor += token.length;
  };

  // Returns the string literal starting at `start`, quotes included.
  // Triple-quoted literals are supported, and an unterminated literal
  // extends to the end of the input.
  const readString = start => {
    const quote = code[start];
    const triple = quote.repeat(3);
    const delimiter = code.startsWith(triple, start) ? triple : quote;
    let end = start + delimiter.length;
    while (end < code.length && !code.startsWith(delimiter, end)) {
      // A backslash escapes the next character, so `\"` does not close the literal.
      end += code[end] === '\\' ? 2 : 1;
    }
    // `slice` clamps to the input length when the literal is unterminated.
    return code.slice(start, end + delimiter.length);
  };

  // Picks the token type for a name given the character that follows it.
  const classifyName = (name, following) => {
    // Checked before KEYWORDS because every logical operator is also a keyword.
    if (LOGIC.has(name)) return 'LOGICAL OPERATOR';
    if (KEYWORDS.has(name)) return 'KEYWORD';
    if (following === '.') return 'MODULE';
    if (following === '(') return 'FUNCTION';
    return 'IDENTIFIER';
  };

  while (cursor < code.length) {
    const char = code[cursor];

    if (char === '(') {
      emit('LPAREN', char);
      continue;
    }
    if (char === ')') {
      emit('RPAREN', char);
      continue;
    }

    const number = matchAt(NUMBER, cursor);
    if (number !== null) {
      // The imaginary suffix is checked first: `1.5j` is imaginary, not a float.
      if (number.endsWith('j')) {
        emit('IMAGINARY', number);
      } else if (number.includes('.')) {
        emit('FLOAT', number);
      } else {
        emit('NUMBER', number);
      }
      continue;
    }

    if (QUOTES.has(char)) {
      emit('STRING', readString(cursor));
      continue;
    }

    const name = matchAt(NAME, cursor);
    if (name !== null) {
      emit(classifyName(name, code[cursor + name.length]), name);
      continue;
    }

    const punctuation = matchAt(PUNCTUATION, cursor);
    if (punctuation !== null) {
      emit('OPERATOR', punctuation);
      continue;
    }

    if (char === '\n' || char === '\r') {
      emit('NEWLINE', char);
      continue;
    }

    if (char === '#') {
      // Skip the comment body; stop at the line break (handled by the next
      // iteration) or at the end of input so a trailing comment cannot hang.
      while (cursor < code.length && code[cursor] !== '\n' && code[cursor] !== '\r') {
        cursor++;
      }
      continue;
    }

    if (WHITESPACE.test(char)) {
      emit('WHITESPACE', char);
      continue;
    }

    throw new TypeError(`I dont know what this character is: ${char}`);
  }

  return tokens;
};
// Command-line entry point: expects the path of the file to tokenize as the
// first user argument and prints a usage hint (exit code 1) when it is missing.
const cliArgs = process.argv;
if (cliArgs.length < 3) {
  console.log(`Usage: node ${cliArgs[1]} FILENAME`);
  process.exit(1);
}

// Load the file as UTF-8 text and print the tokens produced for it.
const fs = require('fs');
const sourcePath = cliArgs[2];
fs.readFile(sourcePath, 'utf8', (readError, contents) => {
  if (readError) {
    throw readError;
  }
  console.log('TOKENS:', tokenizer(contents));
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Takes Python code like
this is the output