Created
January 21, 2013 04:00
-
-
Save KartikTalwar/4583510 to your computer and use it in GitHub Desktop.
Optimizing Brainfuck Compiler
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# | |
# Optimizing brainfuck compiler | |
# Copyright (c) 2013 Nayuki Minase | |
# | |
# http://nayuki.eigenstate.org/page/optimizing-brainfuck-compiler | |
# | |
# This script translates brainfuck source code into C/Java/Python source code. | |
# Usage: python bfc.py BrainfuckFile OutputFile.c/java/py | |
# | |
import os, re, sys | |
# ---- Main ---- | |
def main(args): | |
# Handle command-line arguments | |
if len(args) != 2: | |
print >>sys.stderr, "Usage: python bfc.py BrainfuckFile OutputFile.c/java/py" | |
sys.exit(1) | |
inname = args[0] | |
if not os.path.exists(inname): | |
print >>sys.stderr, inname + ": File does not exist" | |
sys.exit(1) | |
if not os.path.isfile(inname): | |
print >>sys.stderr, inname + ": Not a file" | |
sys.exit(1) | |
outname = args[1] | |
if outname.endswith(".c" ): outfunc = commands_to_c | |
elif outname.endswith(".java"): outfunc = commands_to_java | |
elif outname.endswith(".py" ): outfunc = commands_to_python | |
else: | |
print >>sys.stderr, outname + ": Unknown output type" | |
sys.exit(1) | |
# Read input | |
with open(inname, "r") as fin: | |
incode = fin.read() | |
# Parse and optimize Brainfuck code | |
commands = parse(incode) | |
commands = optimize(commands) | |
commands = optimize(commands) | |
commands = optimize(commands) | |
# Write output | |
tempname = os.path.basename(outname) | |
outcode = outfunc(commands, tempname[ : tempname.index(".")]) | |
with open(outname, "w") as fout: | |
fout.write(outcode) | |
# ---- Parser ---- | |
# Parses the given raw code string, returning a list of Command objects. | |
def parse(codestr): | |
codestr = re.sub(r"[^+\-<>.,\[\]]", "", codestr) # Keep only the 8 Brainfuck characters | |
def chargen(): | |
for c in codestr: | |
yield c | |
while True: # At end of stream | |
yield "" | |
return _parse(chargen(), True) | |
def _parse(chargen, maincall): | |
result = [] | |
for c in chargen: | |
if c == "+": result.append(Add(0, +1)) | |
elif c == "-": result.append(Add(0, -1)) | |
elif c == "<": result.append(Right(-1)) | |
elif c == ">": result.append(Right(+1)) | |
elif c == ",": result.append(Input (0)) | |
elif c == ".": result.append(Output(0)) | |
elif c == "[": result.append(Loop(_parse(chargen, False))) | |
elif c == "]": | |
if maincall: raise ValueError("Extra loop closing") | |
else: return result | |
elif c == "": | |
if maincall: return result | |
else: raise ValueError("Unclosed loop") | |
else: | |
raise AssertionError("Illegal code character") | |
# ---- Optimizers ---- | |
# Optimizes the given list of Commands, returning a new list of Commands. | |
def optimize(commands): | |
result = [] | |
offset = 0 # How much the memory pointer has moved without being updated | |
for cmd in commands: | |
if isinstance(cmd, Assign): | |
result.append(Assign(cmd.offset + offset, cmd.value)) | |
elif isinstance(cmd, MultAssign): | |
result.append(MultAssign(cmd.srcOff + offset, cmd.destOff + offset, cmd.value)) | |
elif isinstance(cmd, Add): | |
# Try to fuse into previous command | |
prev = result[-1] if len(result) >= 1 else None | |
if isinstance(prev, Add) and prev.offset == cmd.offset + offset: | |
prev.value = (prev.value + cmd.value) & 0xFF | |
elif isinstance(prev, Assign) and prev.offset == cmd.offset + offset: | |
prev.value = (prev.value + cmd.value) & 0xFF | |
else: | |
result.append(Add(cmd.offset + offset, cmd.value)) | |
elif isinstance(cmd, MultAdd): | |
# Try to fuse into previous command | |
prev = result[-1] if len(result) >= 1 else None | |
if isinstance(prev, Assign) and prev.offset == cmd.destOff + offset and prev.value == 0: | |
result[-1] = MultAssign(cmd.srcOff + offset, cmd.destOff + offset, cmd.value) | |
else: | |
result.append(MultAdd(cmd.srcOff + offset, cmd.destOff + offset, cmd.value)) | |
elif isinstance(cmd, Right): | |
offset += cmd.offset | |
elif isinstance(cmd, Input): | |
result.append(Input(cmd.offset + offset)) | |
elif isinstance(cmd, Output): | |
result.append(Output(cmd.offset + offset)) | |
else: | |
# Commit the pointer movement before starting a loop | |
if offset != 0: | |
result.append(Right(offset)) | |
offset = 0 | |
if isinstance(cmd, Loop): | |
temp = optimize_simple_loop(cmd.commands) | |
if temp is not None: | |
result.extend(temp) | |
else: | |
temp = optimize_complex_loop(cmd.commands) | |
if temp is not None: | |
result.append(temp) | |
else: | |
result.append(Loop(optimize(cmd.commands))) | |
elif isinstance(cmd, If): | |
result.append(If(optimize(cmd.commands))) | |
else: | |
raise AssertionError("Unknown command") | |
# Commit the pointer movement before exiting this block | |
if offset != 0: | |
result.append(Right(offset)) | |
return result | |
# Tries to optimize the given list of looped commands into a list that would be executed without looping. Returns None if not possible. | |
def optimize_simple_loop(commands): | |
deltas = {} # delta[i] = v means that in each loop iteration, mem[p + i] is added by the amount v | |
offset = 0 | |
for cmd in commands: | |
# This implementation can only optimize loops that consist of only Add and Right | |
if isinstance(cmd, Add): | |
off = cmd.offset + offset | |
deltas[off] = deltas.get(off, 0) + cmd.value | |
elif isinstance(cmd, Right): | |
offset += cmd.offset | |
else: | |
return None | |
# Can't optimize if a loop iteration has a net pointer movement, or if the cell being tested isn't decremented by 1 | |
if offset != 0 or deltas.get(0, 0) != -1: | |
return None | |
# Convert the loop into a list of multiply-add commands that source from the cell being tested | |
result = [] | |
del deltas[0] | |
offsets = deltas.keys() | |
offsets.sort() | |
for off in offsets: | |
result.append(MultAdd(0, off, deltas[off])) | |
result.append(Assign(0, 0)) | |
return result | |
def optimize_complex_loop(commands): | |
otherresult = [] | |
deltas = {} | |
clears = set() | |
nonclears = set() | |
for cmd in commands: | |
if isinstance(cmd, Add): | |
deltas[cmd.offset] = deltas.get(cmd.offset, 0) + cmd.value | |
nonclears.add(cmd.offset) | |
elif isinstance(cmd, MultAdd): | |
nonclears.add(cmd.destOff) | |
otherresult.append(MultAdd(cmd.srcOff, cmd.destOff, cmd.value)) | |
elif isinstance(cmd, MultAssign): | |
nonclears.add(cmd.destOff) | |
otherresult.append(MultAssign(cmd.srcOff, cmd.destOff, cmd.value)) | |
elif isinstance(cmd, Assign): | |
(clears if cmd.value == 0 else nonclears).add(cmd.offset) | |
otherresult.append(Assign(cmd.offset, cmd.value)) | |
else: | |
return None | |
if deltas.get(0, 0) != -1: | |
return None | |
for cmd in otherresult: | |
if isinstance(cmd, (MultAdd,MultAssign)) and (cmd.srcOff in nonclears or cmd.srcOff not in clears or cmd.destOff == 0) or \ | |
isinstance(cmd, (Assign,MultAssign)) and cmd.offset in deltas: | |
return None | |
deltaresult = [] | |
del deltas[0] | |
offsets = deltas.keys() | |
offsets.sort() | |
for off in offsets: | |
deltaresult.append(MultAdd(0, off, deltas[off])) | |
deltaresult.append(Assign(0, 0)) | |
return If(deltaresult + otherresult) | |
# ---- Output formatters ---- | |
def commands_to_c(commands, name, maincall=True, indentlevel=1): | |
def indent(line, level=indentlevel): | |
return "\t" * level + line + "\n" | |
result = "" | |
if maincall: | |
result += indent("#include <stdio.h>", 0) | |
result += indent("#include <string.h>", 0) | |
result += indent("", 0) | |
result += indent("int main(int argc, char **argv) {", 0) | |
result += indent("unsigned char mem[1000000];") | |
result += indent("unsigned char *p = &mem[1000];") | |
result += indent("memset(mem, 0, sizeof(mem));") | |
result += indent("") | |
for cmd in commands: | |
if isinstance(cmd, Assign ): result += indent("p[{}] = {};" .format(cmd.offset, cmd.value)) | |
elif isinstance(cmd, Add ): result += indent("p[{}] += {};" .format(cmd.offset, cmd.value)) | |
elif isinstance(cmd, MultAssign): result += indent("p[{}] = p[{}] * {};" .format(cmd.destOff, cmd.srcOff, cmd.value)) | |
elif isinstance(cmd, MultAdd ): result += indent("p[{}] += p[{}] * {};".format(cmd.destOff, cmd.srcOff, cmd.value)) | |
elif isinstance(cmd, Right ): result += indent("p += {};" .format(cmd.offset)) | |
elif isinstance(cmd, Input ): result += indent("p[{}] = getchar();" .format(cmd.offset)) | |
elif isinstance(cmd, Output ): result += indent("putchar(p[{}]);" .format(cmd.offset)) | |
elif isinstance(cmd, If ): | |
result += indent("if (*p != 0) {") | |
result += commands_to_c(cmd.commands, name, False, indentlevel + 1) | |
result += indent("}") | |
elif isinstance(cmd, Loop ): | |
result += indent("while (*p != 0) {") | |
result += commands_to_c(cmd.commands, name, False, indentlevel + 1) | |
result += indent("}") | |
else: raise AssertionError("Unknown command") | |
if maincall: | |
result += indent("") | |
result += indent("return 0;") | |
result += indent("}", 0) | |
return result | |
def commands_to_java(commands, name, maincall=True, indentlevel=2): | |
def indent(line, level=indentlevel): | |
return "\t" * level + line + "\n" | |
result = "" | |
if maincall: | |
result += indent("import java.io.IOException;", 0) | |
result += indent("", 0) | |
result += indent("public class " + name + " {", 0) | |
result += indent("public static void main(String[] args) throws IOException {", 1) | |
result += indent("byte[] mem = new byte[1000000];") | |
result += indent("int i = 1000;") | |
result += indent("") | |
for cmd in commands: | |
if isinstance(cmd, Assign ): result += indent("mem[i + {}] = (byte){};" .format(cmd.offset, cmd.value)) | |
elif isinstance(cmd, Add ): result += indent("mem[i + {}] += {};" .format(cmd.offset, cmd.value)) | |
elif isinstance(cmd, MultAssign): | |
if cmd.value == 1: | |
result += indent("mem[i + {}] = mem[i + {}];".format(cmd.destOff, cmd.srcOff)) | |
else: | |
result += indent("mem[i + {}] = (byte)(mem[i + {}] * {});".format(cmd.destOff, cmd.srcOff, cmd.value)) | |
elif isinstance(cmd, MultAdd ): | |
if cmd.value == 1: | |
result += indent("mem[i + {}] += mem[i + {}];".format(cmd.destOff, cmd.srcOff)) | |
else: | |
result += indent("mem[i + {}] += mem[i + {}] * {};".format(cmd.destOff, cmd.srcOff, cmd.value)) | |
elif isinstance(cmd, Right ): result += indent("i += {};" .format(cmd.offset)) | |
elif isinstance(cmd, Input ): result += indent("mem[{}] = (byte)System.in.read();" .format(cmd.offset)) | |
elif isinstance(cmd, Output ): result += indent("System.out.write(mem[i + {}]);" .format(cmd.offset)) + indent("System.out.flush();") | |
elif isinstance(cmd, If ): | |
result += indent("if (mem[i] != 0) {") | |
result += commands_to_java(cmd.commands, name, False, indentlevel + 1) | |
result += indent("}") | |
elif isinstance(cmd, Loop ): | |
result += indent("while (mem[i] != 0) {") | |
result += commands_to_java(cmd.commands, name, False, indentlevel + 1) | |
result += indent("}") | |
else: raise AssertionError("Unknown command") | |
if maincall: | |
result += indent("}", 1) | |
result += indent("}", 0) | |
return result | |
def commands_to_python(commands, name, maincall=True, indentlevel=0): | |
def indent(line, level=indentlevel): | |
return "\t" * level + line + "\n" | |
result = "" | |
if maincall: | |
result += indent("import sys") | |
result += indent("") | |
result += indent("mem = [0] * 1000000") | |
result += indent("i = 1000") | |
result += indent("") | |
for cmd in commands: | |
if isinstance(cmd, Assign ): result += indent("mem[i + {}] = {}".format(cmd.offset, cmd.value)) | |
elif isinstance(cmd, Add ): result += indent("mem[i + {}] = (mem[i + {}] + {}) & 0xFF".format(cmd.offset, cmd.offset, cmd.value)) | |
elif isinstance(cmd, MultAssign): result += indent("mem[i + {}] = (mem[i + {}] * {}) & 0xFF".format(cmd.destOff, cmd.srcOff, cmd.value)) | |
elif isinstance(cmd, MultAdd ): result += indent("mem[i + {}] = (mem[i + {}] + mem[i + {}] * {}) & 0xFF".format(cmd.destOff, cmd.destOff, cmd.srcOff, cmd.value)) | |
elif isinstance(cmd, Right ): result += indent("i += {}".format(cmd.offset)) | |
elif isinstance(cmd, Input ): result += indent("mem[{}] = ord(sys.stdin.read())".format(cmd.offset)) | |
elif isinstance(cmd, Output ): result += indent("sys.stdout.write(chr(mem[i + {}]))".format(cmd.offset)) | |
elif isinstance(cmd, If ): | |
result += indent("if mem[i] != 0:") | |
result += commands_to_python(cmd.commands, name, False, indentlevel + 1) | |
elif isinstance(cmd, Loop ): | |
result += indent("while mem[i] != 0:") | |
result += commands_to_python(cmd.commands, name, False, indentlevel + 1) | |
else: raise AssertionError("Unknown command") | |
return result | |
# ---- Intermediate representation (IR) ---- | |
class Command(object): # Common superclass | |
pass | |
class Assign(Command): | |
def __init__(self, offset, value): | |
self.offset = offset | |
self.value = value | |
class Add(Command): | |
def __init__(self, offset, value): | |
self.offset = offset | |
self.value = value | |
class MultAssign(Command): | |
def __init__(self, srcOff, destOff, value): | |
self.srcOff = srcOff | |
self.destOff = destOff | |
self.value = value | |
class MultAdd(Command): | |
def __init__(self, srcOff, destOff, value): | |
self.srcOff = srcOff | |
self.destOff = destOff | |
self.value = value | |
class Right(Command): | |
def __init__(self, offset): | |
self.offset = offset | |
class Input(Command): | |
def __init__(self, offset): | |
self.offset = offset | |
class Output(Command): | |
def __init__(self, offset): | |
self.offset = offset | |
class If(Command): | |
def __init__(self, commands): | |
self.commands = commands | |
class Loop(Command): | |
def __init__(self, commands): | |
self.commands = commands | |
# ---- Miscellaneous ---- | |
if __name__ == "__main__": | |
main(sys.argv[1:]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment