Last active
August 3, 2017 12:31
-
-
Save evanxg852000/96750c219dd1552d774075d9424617a0 to your computer and use it in GitHub Desktop.
Ultra Tiny Compiler in python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# inspired by https://the-super-tiny-compiler.glitch.me/ | |
''' | |
(print 12) | |
(add 2 2) | |
(subtract 4 2) | |
(add 2 (subtract 4 2)) | |
''' | |
import re | |
import sys | |
class Token(object): | |
def __init__(self, type, value): | |
self.type= type | |
self.value = value | |
def __repr__(self): | |
return '<{}:{}>'.format(self.type, self.value) | |
class Node(Token): | |
def __init__(self, type, value): | |
Token.__init__(self, type, value) | |
self.children = [] | |
def __repr__(self): | |
cstr ='' | |
for child in self.children : | |
cstr += child.__repr__() + '\n' | |
str = '''{}:{} \n{}''' | |
return str.format(self.type, self.value, cstr) | |
# Tokenizer -> tokens (lexems) | |
def lexer(input): | |
tokens = [] | |
pos = 0 #position | |
char = None | |
WHITESPACE = re.compile('\s') | |
FIGURE = re.compile('[0-9]') | |
INDENTIFIER = re.compile('[a-zA-Z]') | |
while (pos < len(input)): | |
char = input[pos] | |
if WHITESPACE.match(char): | |
pos +=1 | |
continue | |
if char == '(': | |
tokens.append(Token('lparen', char)) | |
pos += 1 | |
continue | |
if char == ')': | |
tokens.append(Token('rparen', char)) | |
pos += 1 | |
continue | |
if FIGURE.match(char): | |
num = '' | |
while FIGURE.match(char): | |
num += char | |
pos +=1 # consume the character | |
char = input[pos] | |
tokens.append(Token('number', num)) | |
continue | |
if char == '"': | |
str = '' | |
pos += 1 # skip the opening " | |
char = input[pos] | |
while char !='"': # error checking needed here (let's keep it simple) | |
str += char | |
pos += 1 | |
char = input[pos] | |
pos += 1 # skip the closing " | |
tokens.append(Token('string', str)) | |
continue | |
if INDENTIFIER.match(char): | |
idf = '' | |
while INDENTIFIER.match(char): | |
idf += char | |
pos +=1 # consume the character | |
char = input[pos] | |
tokens.append(Token('identifier', idf)) | |
continue | |
raise 'Unknown character !' | |
return tokens | |
# Parser -> ast | |
def parser (tokens): | |
pos = 0 | |
def walk(): | |
nonlocal pos | |
tk = tokens[pos] | |
if tk.type == 'number': | |
pos += 1 | |
return Node('NumberLiteral', tk.value) | |
if tk.type == 'string': | |
pos += 1 | |
return Node('StringLiteral', tk.value) | |
if tk.type == 'lparen' : | |
# skip lparen | |
pos += 1 | |
tk = tokens[pos] | |
callExpr = Node('CallExpression', tk.value) | |
pos += 1 | |
tk = tokens[pos] | |
while tk.type != 'rparen': # look for closing paren | |
callExpr.children.append(walk()) | |
tk = tokens[pos] | |
pos += 1 | |
return callExpr | |
#raise 'Unknown token!' | |
ast = Node('Program', None) | |
while pos < len(tokens): | |
ast.children.append(walk()) | |
return ast | |
# Interpreter -> run | |
def interpreter(ast): | |
pass # please fill this in as a code kata | |
# code generation | |
def generator (ast): | |
if ast.type == 'Program': | |
template =''' | |
(function(add, sub, mul, div, print){ | |
_code_ | |
})(function (a, b){return a+b;}, | |
function (a, b){return a-b;}, | |
function (a, b){return a*b;}, | |
function (a, b){return a/b;}, | |
console.log) | |
''' | |
code = '' | |
for n in ast.children: | |
code += generator(n) +'; ' | |
return template.replace('_code_', code) | |
if ast.type == 'StringLiteral': | |
return "'{}'".format(ast.value) | |
if ast.type == 'NumberLiteral': | |
return ast.value | |
if ast.type == 'CallExpression': | |
args = '' | |
for n in ast.children: | |
args += generator(n) +', ' | |
args = args.strip(', ') | |
return '{}({})'.format(ast.value, args) | |
return template.replace('_code_', code) | |
def main(argv): | |
# REPL interpreter | |
# if len(argv) == 1 and argv[0] == '-i': | |
# while True: | |
# input = input() | |
# if(input == 'exit'): | |
# break | |
# tokens = lexer(prog) | |
# ast = parser(tokens) | |
# print(interpreter(ast)) | |
prog = ''' | |
(print 12) | |
(print "alex") | |
(print (add 12 (sub 7 3))) | |
''' | |
tokens = lexer(prog) | |
ast = parser(tokens) | |
code = generator(ast) | |
print(code) | |
if __name__ == '__main__': | |
main(sys.argv[1:]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment