Last active
March 1, 2017 00:04
-
-
Save wallabra/81ae18cb05909b15d9460542e46d0e9e to your computer and use it in GitHub Desktop.
Failed attempt at some Cyberspeare
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import cPickle as pk
import difflib
import random
import re

from pybrain.datasets import SupervisedDataSet
from pybrain.structure import SoftmaxLayer
from pybrain.supervised.trainers import BackpropTrainer
from pybrain.tools.shortcuts import buildNetwork as new_net
# Alphabet of characters the model can encode; anything else is dropped.
ic = "abcdefghijklmnopqrstuvwxyz-_!?,.;:@#$ "

num_seeds = 150    # how many seed strings are generated
num_samples = 80   # how many (seed, target) pairs go into the dataset
seed_size = 250    # characters per seed (network input width)
result_size = 85   # characters per target (network output width)
iterations = 400   # number of training epochs
def random_segment(s, size):
    """Return a random contiguous slice of ``s`` that is ``size`` items long.

    Every start position, including the one that yields the final
    ``size`` items of ``s``, is equally likely.

    Raises:
        ValueError: if ``size`` is larger than ``len(s)``.
    """
    if size > len(s):
        raise ValueError(
            "segment size {} exceeds sequence length {}".format(size, len(s))
        )
    # randint includes both endpoints, so the last valid start is len(s) - size.
    pos = random.randint(0, len(s) - size)
    return s[pos:pos + size]
def cap(s, min_len, default=" "):
    """Right-pad ``s`` with ``default`` until it is at least ``min_len`` long.

    ``s`` is returned unchanged when it is already long enough. ``default``
    is repeated once per missing position, so a multi-character ``default``
    can make the result longer than ``min_len``.
    """
    missing = max(min_len - len(s), 0)
    return s + default * missing
def tesselate(l, size, sep=" ", fill=" "):
    """Build a string of at least ``size`` characters from random items of ``l``.

    Items are drawn with ``random.choice`` and joined with ``sep`` until the
    joined text would reach ``size`` characters. The item that crosses the
    limit is dropped and the remainder is padded with ``fill`` via ``cap``.

    Note: the loop never ends if ``size > 0`` while ``sep`` and every item
    of ``l`` are empty strings; an empty ``l`` makes ``random.choice`` raise.
    """
    picked = []
    joined_len = 0  # equals len(sep.join(picked)), kept without re-joining
    while joined_len < size:
        word = random.choice(l)
        if picked:
            joined_len += len(sep)
        joined_len += len(word)
        picked.append(word)
    # Drop the last pick: it is the one that pushed the text to/over `size`.
    text = sep.join(picked[:-1])
    return cap(text, size, fill)
def character_pos(c):
    """Encode a single character of ``ic`` as a float in ``[-1.0, 1.0)``.

    The character's index in ``ic`` is scaled linearly: index 0 maps to
    -1.0 and each following index adds ``2.0 / len(ic)``.

    Raises:
        ValueError: if ``c`` does not occur in ``ic``.
    """
    # str.index avoids rebuilding a list of the alphabet on every call.
    return (float(ic.index(c)) / float(len(ic))) * 2.0 - 1.0
def de_character_pos(f):
    """Decode a float back into the character of ``ic`` whose bin contains it.

    Inverse of ``character_pos``: ``[-1.0, 1.0)`` is split into ``len(ic)``
    equal bins, one per character. Values that fall outside every bin decode
    to the empty string.
    """
    # character_pos places each character exactly on the lower edge of its
    # bin, so floating-point rounding can land a hair below that edge; the
    # tiny nudge keeps truncation from selecting the previous character.
    index = int(((f + 1.0) / 2.0) * len(ic) + 1e-9)
    # Reject negative indices explicitly: ic[-1] would silently wrap around.
    if 0 <= index < len(ic):
        return ic[index]
    return ""
def char_map(s):
    """Encode ``s`` as a list of floats, one per character found in ``ic``.

    Characters of ``s`` that are not in ``ic`` are skipped, so the result
    may be shorter than ``s``.
    """
    return [character_pos(c) for c in s if c in ic]
def de_char_map(s):
    """Decode an iterable of floats into a string via ``de_character_pos``."""
    return "".join(de_character_pos(value) for value in s)
assert de_char_map(char_map("hello world!")) == "hello world!" | |
print "Setting up..." | |
chars = "".join([x for x in re.sub(r'\s+', ' ', open('corpus.txt').read().lower()) if x in ic]) | |
A = len(chars) # number of letters | |
_in = char_map(chars) | |
words = chars.split(" ") | |
seeds = [tesselate(words, ) for _ in xrange(num_seeds)] | |
fs = [char_map(s) for s in seeds] | |
word_size = max(len(s) for s in words) | |
ds = SupervisedDataSet(seed_size, result_size) | |
print "Filling dataset..." | |
for _ in xrange(num_samples): | |
sd = random.choice(fs) | |
c = tesselate(words, result_size) | |
print "[training] {} -> {}\n".format(de_char_map(sd), c) | |
ds.addSample(sd, char_map(c)) | |
# Reuse a previously saved network when one exists; otherwise build a new
# one with seed_size inputs, a single hidden layer of SoftmaxLayer units and
# result_size outputs.
# NOTE(review): unpickling can execute arbitrary code -- only load a
# "thisnet.pickle" that this script wrote itself.
try:
    with open("thisnet.pickle") as net_file:
        net = pk.load(net_file)
except IOError:
    # No saved network could be opened: start from scratch.
    hidden_size = (len(ic) * seed_size + result_size) // 2
    net = new_net(seed_size, hidden_size, result_size, hiddenclass=SoftmaxLayer)

trainer = BackpropTrainer(net, ds)
print "Seeding..." | |
seeds = ["".join([random.choice(ic) for _ in xrange(seed_size)]) for _ in xrange(num_seeds)] | |
fs = [char_map(s) for s in seeds] | |
print "Training..." | |
# train the model iteratively; draw a sample after every epoch. | |
for i in xrange(iterations): | |
print "{}:".format(i), | |
diff = trainer.train() | |
print ".", | |
seed = random.choice(fs) | |
print "{} -> {} ({}%)\n".format(de_char_map(seed), de_char_map(net.activate(seed)), 100.0 - diff * 100) | |
open("thisnet.pickle", "w").write(pk.dumps(net)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment