#!/usr/bin/env python
import theano
import theano.tensor as T
import numpy as np
import sys
def create_ngram_data(input_file, ngram_size):
    '''Reads input_file and returns a character ngram dataset
    where each row is an ngram [char1, char2, char3,.. charN],
    with N=ngram_size, and the characters are mapped to ASCII id's.
    '''
    data = []
    with open(input_file, 'r') as f:
        for line in f:
            for i in xrange(len(line) - ngram_size):
                data.append([ord(c) for c in line[i:i+ngram_size]])
    return data
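
# Example (for illustration): with ngram_size=5, the line "hello\n" yields a
# single ngram [104, 101, 108, 108, 111] ("hello"); because the inner range
# stops at len(line) - ngram_size, the trailing newline never enters an ngram.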
def perplexity(testdata, ngram_size, inference_model):
    '''Computes perplexity of the provided inference model on an ngram dataset:
    perplexity = 2^( -(1/count) * sum_i log2 p(charN_i | char1_i .. charN-1_i) )
    '''
    nn = ngram_size - 1
    expn = 0.0
    count = 0
    for d in testdata:
        prob = inference_model(d[0:nn])[0][d[nn]]
        expn -= np.log2(prob)
        count += 1
    return 2**(expn/count)
def generate_sample(initial_input, inference_model):
    '''Generate a random text sample given initial input and an inference model
    '''
    inp = initial_input
    print ' '.join([chr(i) for i in initial_input]),
    for i in range(30):
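        # sample the next character id from the model's softmax distribution
        # over the 128 ASCII codes; the commented-out argmax line below would
        # give greedy (most-likely-character) decoding instead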
        o = np.random.choice(128, 1, p=inference_model(inp)[0])[0]
        #o = inference_model(inp)[0].argmax()
        print chr(o),
        inp.pop(0)
        inp.append(o)
def create_graphs(ngram_size, vocab_size, embedding_size, hidden_size):
    '''Returns the (inference graph, training graph) of a feedforward neural language model
    Assume a 5-gram setup, where input context = [char1,char2,char3,char4] to predict [char5]
    - inference_graph([char1,char2,char3,char4]) outputs softmax probability distribution for char5
    - train_graph([char1,char2,char3,char4],[char5]) will update neural language model and return training cost for this sample
    '''
    # embedding matrix
    E = theano.shared(np.random.randn(vocab_size, embedding_size))
    # input_id is a vector [char1,char2,char3,char4], represented by ASCII values
    input_id = T.ivector('input_id')
    # indexes the embedding matrix and concatenates
    concat_embedding = E[input_id].reshape((1,-1))
    # TODO1: MLP code here to connect concat_embedding to output
    # output = T.nnet.softmax...
    # ....
    # inference_model = theano.function([input_id], output)
    # TODO2: code here for training. output_id is target to predict, i.e. char5
    # output_id = T.iscalar('output_id')
    # ....
    # train_model = theano.function([input_id, output_id], cost, updates=...
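    # One possible completion of TODO1/TODO2 so the skeleton runs end to end
    # (an illustrative sketch, not the original author's solution; the tanh
    # hidden layer, 0.01 initialization scale, and plain SGD with learning
    # rate 0.1 are assumptions):
    W1 = theano.shared(0.01 * np.random.randn((ngram_size - 1) * embedding_size, hidden_size))
    b1 = theano.shared(np.zeros(hidden_size))
    W2 = theano.shared(0.01 * np.random.randn(hidden_size, vocab_size))
    b2 = theano.shared(np.zeros(vocab_size))
    hidden = T.tanh(T.dot(concat_embedding, W1) + b1)
    output = T.nnet.softmax(T.dot(hidden, W2) + b2)
    inference_model = theano.function([input_id], output)
    output_id = T.iscalar('output_id')
    cost = -T.log(output[0, output_id])
    params = [E, W1, b1, W2, b2]
    grads = T.grad(cost, params)
    updates = [(p, p - 0.1 * g) for p, g in zip(params, grads)]
    train_model = theano.function([input_id, output_id], cost, updates=updates)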
    return inference_model, train_model
# 0: run program: "python nlm.py textfile.txt" where textfile is an ASCII text
input_text = sys.argv[1]

# 1: create dataset. we'll have 5-gram language models, i.e. given 4 characters, predict the 5th
ngram_size = 5
data = create_ngram_data(input_text, ngram_size)
traindata, testdata = data[0:int(len(data)*0.8)], data[int(len(data)*0.8):]

# 2: create computation graphs
inference_graph, train_graph = create_graphs(ngram_size, vocab_size=128, embedding_size=7, hidden_size=25)

# 3: training loop
for epoch in range(10):
    cumulative_cost = 0
    for d in traindata:
        nn = ngram_size - 1
        c = train_graph(d[0:nn], d[nn])
        cumulative_cost += c
    print "Epoch=%d CumulativeCost=%f" % (epoch, cumulative_cost),
    print "TrainPerplexity=%f TestPerplexity=%f" % (perplexity(traindata, ngram_size, inference_graph),
                                                    perplexity(testdata, ngram_size, inference_graph))
    for i in range(3):
        print "sample: ",
        generate_sample(data[i][0:nn], inference_graph)
        print ""