"""
An approach to Gaussian attention as per Graves' "Generating Sequences with
Recurrent Neural Networks".

Bitmap utils from Shawn Tan.
"""
# Author: Kyle Kastner
# License: BSD 3-clause
# Equation 46 from http://arxiv.org/abs/1308.0850
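# The soft window from that paper, over a conditioning sequence c of length U:
#     phi(t, u) = sum_k alpha_t^k * exp(-beta_t^k * (kappa_t^k - u)^2)
#     w_t = sum_u phi(t, u) * c_u
# where alpha (importance), beta (width), and kappa (location) are predicted
# from the hidden state, and kappa_t = kappa_{t-1} + exp(kappa_hat_t), so the
# window only ever moves forward along the conditioning sequence.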
from theano import tensor
from scipy import linalg
import theano
import numpy as np
import matplotlib.pyplot as plt
import time
eps = 1E-12
characters = np.array([
0x0,
0x808080800080000,
0x2828000000000000,
0x287C287C280000,
0x81E281C0A3C0800,
0x6094681629060000,
0x1C20201926190000,
0x808000000000000,
0x810202010080000,
0x1008040408100000,
0x2A1C3E1C2A000000,
0x8083E08080000,
0x81000,
0x3C00000000,
0x80000,
0x204081020400000,
0x1824424224180000,
0x8180808081C0000,
0x3C420418207E0000,
0x3C420418423C0000,
0x81828487C080000,
0x7E407C02423C0000,
0x3C407C42423C0000,
0x7E04081020400000,
0x3C423C42423C0000,
0x3C42423E023C0000,
0x80000080000,
0x80000081000,
0x6186018060000,
0x7E007E000000,
0x60180618600000,
0x3844041800100000,
0x3C449C945C201C,
0x1818243C42420000,
0x7844784444780000,
0x3844808044380000,
0x7844444444780000,
0x7C407840407C0000,
0x7C40784040400000,
0x3844809C44380000,
0x42427E4242420000,
0x3E080808083E0000,
0x1C04040444380000,
0x4448507048440000,
0x40404040407E0000,
0x4163554941410000,
0x4262524A46420000,
0x1C222222221C0000,
0x7844784040400000,
0x1C222222221C0200,
0x7844785048440000,
0x1C22100C221C0000,
0x7F08080808080000,
0x42424242423C0000,
0x8142422424180000,
0x4141495563410000,
0x4224181824420000,
0x4122140808080000,
0x7E040810207E0000,
0x3820202020380000,
0x4020100804020000,
0x3808080808380000,
0x1028000000000000,
0x7E0000,
0x1008000000000000,
0x3C023E463A0000,
0x40407C42625C0000,
0x1C20201C0000,
0x2023E42463A0000,
0x3C427E403C0000,
0x18103810100000,
0x344C44340438,
0x2020382424240000,
0x800080808080000,
0x800180808080870,
0x20202428302C0000,
0x1010101010180000,
0x665A42420000,
0x2E3222220000,
0x3C42423C0000,
0x5C62427C4040,
0x3A46423E0202,
0x2C3220200000,
0x1C201804380000,
0x103C1010180000,
0x2222261A0000,
0x424224180000,
0x81815A660000,
0x422418660000,
0x422214081060,
0x3C08103C0000,
0x1C103030101C0000,
0x808080808080800,
0x38080C0C08380000,
0x324C000000,
], dtype=np.uint64)
bitmap = np.unpackbits(characters.view(np.uint8)).reshape(
    characters.shape[0], 8, 8)
bitmap = bitmap[:, ::-1, :]
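# Each uint64 above encodes one 8x8 glyph: viewing it as 8 bytes and unpacking
# the bits yields an (n_chars, 8, 8) binary array; the row flip compensates
# for little-endian byte order so the glyphs come out right side up.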
chars = " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`"
chars += "abcdefghijklmnopqrstuvwxyz{|}~"
mapping = {c: i for i, c in enumerate(chars)}
def string_to_image(string):
    return np.hstack(np.array([bitmap[mapping[c]] for c in string])).T[:, ::-1]


def string_to_index(string):
    return np.asarray([mapping[c] for c in string])


def index_to_onehot(indexes):
    o = np.zeros(
        (indexes.shape[0], indexes.shape[1], len(chars))).astype("int32")
    for n, i in enumerate(indexes):
        o[n, np.arange(indexes.shape[1]), i] = 1
    return o


def onehot_to_index(one_hot):
    return np.argmax(one_hot, axis=-1)
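

# Sanity check for the helpers above (comments only): every character renders
# to an 8x8 binary glyph, so string_to_image("cat") has shape (24, 8), and
# string_to_index("cat") gives each character's position in `chars` (ASCII
# order starting from space), i.e. array([67, 65, 84]).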
class adadelta(object):
    """
    An adaptive learning rate optimizer

    For more information, see:
    Matthew D. Zeiler, "ADADELTA: An Adaptive Learning Rate Method"
    arXiv:1212.5701.
    """
    def __init__(self, params, running_grad_decay=0.95, running_up_decay=0.95,
                 eps=1E-6):
        self.running_grad_decay = running_grad_decay
        self.running_up_decay = running_up_decay
        self.eps = eps
        self.running_up2_ = [theano.shared(np.zeros_like(p.get_value()))
                             for p in params]
        self.running_grads2_ = [theano.shared(np.zeros_like(p.get_value()))
                                for p in params]
        self.previous_grads_ = [theano.shared(np.zeros_like(p.get_value()))
                                for p in params]

    def updates(self, params, grads):
        running_grad_decay = self.running_grad_decay
        running_up_decay = self.running_up_decay
        eps = self.eps
        updates = []
        for n, (param, grad) in enumerate(zip(params, grads)):
            running_grad2 = self.running_grads2_[n]
            running_up2 = self.running_up2_[n]
            previous_grad = self.previous_grads_[n]
            rg2up = running_grad_decay * running_grad2 + (
                1. - running_grad_decay) * (grad ** 2)
            updir = -tensor.sqrt(running_up2 + eps) / tensor.sqrt(
                running_grad2 + eps) * previous_grad
            ru2up = running_up_decay * running_up2 + (
                1. - running_up_decay) * (updir ** 2)
            updates.append((previous_grad, grad))
            updates.append((running_grad2, rg2up))
            updates.append((running_up2, ru2up))
            updates.append((param, param + updir))
        return updates
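

# The recursions implemented above, per Zeiler (2012):
#   E[g^2]_t  = rho * E[g^2]_{t-1} + (1 - rho) * g_t^2
#   dx_t      = -(RMS[dx]_{t-1} / RMS[g]_t) * g_t, RMS[v] = sqrt(E[v^2] + eps)
#   E[dx^2]_t = rho * E[dx^2]_{t-1} + (1 - rho) * dx_t^2
# Note that `updates` applies the step using the stored *previous* gradient
# and the pre-update running average, a one-step-lagged variant of the
# paper's Algorithm 1.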
def make_minibatch_from_strings(strings):
    X_shapes = [string_to_image(s).shape for s in strings]
    y_shapes = [string_to_index(s).shape for s in strings]
    max_X_len = max([sh[0] for sh in X_shapes])
    max_y_len = max([sh[0] for sh in y_shapes])
    minibatch_size = len(strings)
    # assume all feature dimensions are equal!
    X_mb = np.zeros((max_X_len, minibatch_size, X_shapes[-1][1])).astype(
        theano.config.floatX)
    X_mask = np.zeros((max_X_len, len(strings))).astype(theano.config.floatX)
    y_mb = np.zeros((max_y_len, minibatch_size)).astype("int32")
    y_mask = np.ones_like(y_mb).astype(theano.config.floatX)
    for n, s in enumerate(strings):
        X = string_to_image(s)
        y = string_to_index(s)
        X_mb[:X.shape[0], n, :] = X
        X_mask[:X.shape[0], n] = 1.
        y_mb[:y.shape[0], n] = y
        y_mask[:y.shape[0], n] = 1.
    return X_mb, X_mask, y_mb, y_mask
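

# Illustrative shapes: for strings like ["act", "atc"], each 8x8 glyph stacks
# into a (24, 8) image, so X_mb is (24, 2, 8) in time-major
# (length, minibatch, features) layout, X_mask is (24, 2), and y_mb is (3, 2)
# integer indices; masks mark real (non-padded) timesteps with 1.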
def as_shared(arr, name=None):
    if type(arr) in [float, int]:
        if name is None:
            return theano.shared(np.cast[theano.config.floatX](arr))
        else:
            return theano.shared(np.cast[theano.config.floatX](arr),
                                 name=name)
    if name is None:
        return theano.shared(value=arr, borrow=True)
    else:
        return theano.shared(value=arr, name=name, borrow=True)
def np_zeros(shape):
    """ Builds a numpy variable filled with zeros """
    return np.zeros(shape).astype(theano.config.floatX)


def np_ones(shape):
    """ Builds a numpy variable filled with ones """
    return np.ones(shape).astype(theano.config.floatX)
def np_rand(shape, random_state):
    # Make sure bounds aren't the same
    return random_state.uniform(low=-0.08, high=0.08, size=shape).astype(
        theano.config.floatX)


def np_randn(shape, random_state):
    """ Builds a numpy variable filled with random normal values """
    return (0.01 * random_state.randn(*shape)).astype(theano.config.floatX)


def np_tanh_fan(shape, random_state):
    # The . after the 6 is critical! shape has dtype int...
    bound = np.sqrt(6. / np.sum(shape))
    return random_state.uniform(low=-bound, high=bound,
                                size=shape).astype(theano.config.floatX)


def np_sigmoid_fan(shape, random_state):
    return 4 * np_tanh_fan(shape, random_state)
def np_ortho(shape, random_state):
    """ Builds a numpy variable filled with orthonormal random values """
    g = random_state.randn(*shape)
    o_g = linalg.svd(g)[0]
    return o_g.astype(theano.config.floatX)
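

# The fan-based initializers above follow Glorot & Bengio (2010):
# np_tanh_fan draws uniform values in +/- sqrt(6 / (fan_in + fan_out)), and
# np_sigmoid_fan scales that bound by 4, as recommended for sigmoid units.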
def build_model(X, X_mask, y, y_mask, minibatch_size, input_size,
                cond_size, hidden_size, output_size, n_gaussians=5):
    random_state = np.random.RandomState(1999)
    # Input to hidden weights
    W_input_hidden = as_shared(np_tanh_fan((input_size, hidden_size),
                                           random_state))
    b_input = as_shared(np_zeros((hidden_size,)))
    # Conditioning to hidden weights
    W_cond_hidden = as_shared(np_tanh_fan((cond_size, hidden_size),
                                          random_state))
    b_cond = as_shared(np_zeros((hidden_size,)))
    # alpha, beta, kappa = 3 x n_gaussians
    W_window = as_shared(np_tanh_fan((hidden_size, 3 * n_gaussians),
                                     random_state))
    b_window = as_shared(np_zeros((3 * n_gaussians,)))
    # Hidden to hidden
    W_hidden_hidden = as_shared(np_ortho((hidden_size, hidden_size),
                                         random_state))
    # Hidden to output
    W_hidden_output = as_shared(np_tanh_fan((hidden_size, output_size),
                                            random_state))
    b_output = as_shared(np_zeros((output_size,)))
    # Zeros init
    initial_hidden = as_shared(np_zeros((minibatch_size, hidden_size)))
    initial_kappa = as_shared(np_zeros((minibatch_size, n_gaussians)))
    proj_X = tensor.dot(X, W_input_hidden) + b_input
    def _slice(arr, n):
        # First slice is tensor_dim - 1 sometimes with scan...
        # need to be *very* careful
        dim = n_gaussians
        if arr.ndim == 3:
            return arr[:, :, n * dim:(n + 1) * dim]
        return arr[:, n * dim:(n + 1) * dim]
    def step(x_t, xm_t, h_tm1, k_tm1, c, cm, U, W_c, b_c, W_ch, b_ch, s):
        # Project the previous hidden state to the 3 * n_gaussians window
        # parameters (alpha, beta, kappa in Graves' notation)
        window_ti = tensor.dot(h_tm1, W_c) + b_c
        alpha_t = tensor.exp(_slice(window_ti, 0))
        beta_t = tensor.exp(_slice(window_ti, 1))
        # No log 1 + gives very different results...
        # exp(.) > 0, so kappa only ever increases and the window moves
        # monotonically forward over the conditioning sequence
        kappa_t = k_tm1 + tensor.exp(
            _slice(window_ti, 2) + 1E-6)
        theano.printing.Print("k")(kappa_t.shape)
        theano.printing.Print("s")(s.shape)
        # s holds the conditioning positions u = 0 .. U - 1; squared distance
        # between each mixture location and each position,
        # shape (minibatch, cond_len, n_gaussians)
        sq_tx = (kappa_t[:, None, :] - s[None, :, None]) ** 2
        theano.printing.Print("sq_tx")(sq_tx.shape)
        theano.printing.Print("alpha_t")(alpha_t.shape)
        theano.printing.Print("beta_t")(beta_t.shape)
        mixture_t = alpha_t[:, None, :] * tensor.exp(
            -beta_t[:, None, :] * sq_tx)
        theano.printing.Print("mixture_t")(mixture_t.shape)
        # phi(t, u): total attention weight on each conditioning position
        phi_tx = mixture_t.sum(axis=2)
        theano.printing.Print("phi_tx")(phi_tx.shape)
        theano.printing.Print("c")(c.shape)
        theano.printing.Print("cm")(cm.shape)
        masked_c = (c * cm[:, :, None])
        masked_c = masked_c.dimshuffle(1, 0, 2)
        theano.printing.Print("m_c")(masked_c.shape)
        theano.printing.Print("m_n")(phi_tx[:, :, None].shape)
        # w_t: attention-weighted sum over the conditioning sequence
        window_t = (phi_tx[:, :, None] * masked_c).sum(axis=1)
        theano.printing.Print("window_t")(window_t.shape)
        proj_w = tensor.dot(window_t, W_ch) + b_ch
        h_ti = tensor.tanh(x_t + tensor.dot(h_tm1, U) + proj_w)
        # Masking: carry the previous state through padded timesteps
        h_t = xm_t[:, None] * h_ti + (1. - xm_t[:, None]) * h_tm1
        return h_t, kappa_t, alpha_t, beta_t, phi_tx, mixture_t, window_t
"""
rvals1 = step(proj_X[0], X_mask[0],
initial_hidden, initial_kappa,
y, y_mask,
W_hidden_hidden, W_window, b_window, W_cond_hidden, b_cond)
h1, k1, a1, b1, p1, w1 = rvals1
rvals2 = step(proj_X[1], X_mask[1],
h1, k1,
y, y_mask,
W_hidden_hidden, W_window, b_window, W_cond_hidden, b_cond)
h2, k2, a2, b2, p2, w2 = rvals2
params = [W_input_hidden, b_input, W_cond_hidden, b_cond,
W_hidden_hidden, W_window, b_window, W_hidden_output, b_output]
return X, h2, k2, a2, b2, p2, w2, params
"""
    steps = tensor.arange(y.shape[0], dtype=theano.config.floatX)
    theano.printing.Print("steps")(steps.shape)
    rvals, updates = theano.scan(step,
                                 sequences=[proj_X[:-1], X_mask[:-1]],
                                 outputs_info=[initial_hidden, initial_kappa,
                                               None, None, None, None, None],
                                 non_sequences=[y, y_mask,
                                                W_hidden_hidden,
                                                W_window, b_window,
                                                W_cond_hidden, b_cond,
                                                steps])
    h, k, a, b, p, m, w = rvals
    hidden_proj = tensor.dot(h, W_hidden_output) + b_output
    theano.printing.Print("hp")(hidden_proj.shape)
    params = [W_input_hidden, b_input, W_cond_hidden, b_cond,
              W_hidden_hidden, W_window, b_window, W_hidden_output, b_output]
    return X, hidden_proj, k, a, b, p, m, w, params
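

# For reference, a pure numpy rendition of the soft window computed inside
# `step` above; a minimal sketch (never called by the model, and omitting the
# sequence mask), assuming time-major conditioning c of shape
# (cond_len, minibatch, cond_size) and window parameters of shape
# (minibatch, n_gaussians).
def np_gaussian_window(alpha, beta, kappa, c):
    u = np.arange(c.shape[0])
    # sq[i, u, k] = (kappa[i, k] - u) ** 2
    sq = (kappa[:, None, :] - u[None, :, None]) ** 2
    # phi[i, u] = sum_k alpha[i, k] * exp(-beta[i, k] * sq[i, u, k])
    phi = (alpha[:, None, :] * np.exp(-beta[:, None, :] * sq)).sum(axis=2)
    # w[i] = sum_u phi[i, u] * c[u, i]
    return (phi.T[:, :, None] * c).sum(axis=0)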
if __name__ == "__main__":
base_string = "cat"
import itertools
true_strings = sorted(list(set(["".join(i) for i in [
s for s in itertools.permutations(base_string)]])))
print(true_strings)
dataset_size = len(true_strings)
minibatch_size = 2
if dataset_size % int(minibatch_size) != 0:
raise ValueError(
"Minibatch size (%s) not an even multiple " % minibatch_size +
"of dataset size (%s). Handle it!" % dataset_size)
m = np.arange(0, dataset_size, minibatch_size)
minibatch_indices = list(zip(m[:-1], m[1:])) + [(m[-1], dataset_size)]
X, X_mask, y, y_mask = make_minibatch_from_strings(true_strings)
y = index_to_onehot(y)
y = y.astype(theano.config.floatX)
X = X.astype(theano.config.floatX)
    X_sym = tensor.tensor3('X')
    X_mask_sym = tensor.matrix('X_mask')
    y_sym = tensor.tensor3('y')
    y_mask_sym = tensor.matrix('y_mask')
    i, j = minibatch_indices[0]
    X_sym.tag.test_value = X[:, i:j]
    X_mask_sym.tag.test_value = X_mask[:, i:j]
    y_sym.tag.test_value = y[:, i:j]
    y_mask_sym.tag.test_value = y_mask[:, i:j]
    X_res, hidden_proj, k, a, b, p, m, w, params = build_model(
        X_sym, X_mask_sym, y_sym, y_mask_sym,
        minibatch_size,
        X.shape[-1], y.shape[-1],
        256, X.shape[-1], n_gaussians=2)
    predict = tensor.nnet.sigmoid(hidden_proj)
    true = X_sym[1:]
    # predict[t] gives Bernoulli probabilities for the next pixel column
    # X[t + 1]; binary cross-entropy, summed over pixel features and time,
    # averaged over the minibatch
    cost = (-true * tensor.log(predict) - (1 - true) * tensor.log(
        1. - predict)).sum(axis=2)
    cost = cost.sum(axis=0).mean()
    grads = tensor.grad(cost, wrt=params)
    opt = adadelta(params)
    train = theano.function(inputs=[X_sym, X_mask_sym, y_sym, y_mask_sym],
                            outputs=cost,
                            updates=opt.updates(params, grads))
    pred = theano.function(inputs=[X_sym, X_mask_sym, y_sym, y_mask_sym],
                           outputs=[predict, k, a, b, p, m, w])
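    # `train` takes one adadelta step on a minibatch; `pred` runs the network
    # forward and also returns the window internals (kappa, alpha, beta, phi,
    # mixture, window) so the attention trajectory can be plotted.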
    def plot_items(cost, ret, title, base_offset):
        pr, kappa, alpha, beta, phi, mixture, window = ret
        for x in range(minibatch_size):
            ts = time.time()
            f, axarr = plt.subplots(3, sharex=True)
            axarr[0].matshow(pr[:, x, ::-1].T, cmap="gray")
            axarr[1].matshow(alpha[:, x, :].T)
            for k in range(kappa.shape[-1]):
                x_offset = np.arange(len(kappa)) - 0.5
                axarr[2].plot(x_offset, np.log(kappa[:, x, k]),
                              label="N(%i)" % k)
            axarr[2].legend(loc=4)
            if cost is not None:
                plt.suptitle("%s, cost %f" % (title, cost))
            #f.subplots_adjust(hspace=0)
            plt.setp([a.get_xticklabels() for a in f.axes[:-1]],
                     visible=False)
            plt.savefig("%s_%s_%s.png" % (base_offset + x, title, ts))
            plt.close()
    n_epochs = 2000
    for e in range(n_epochs):
        costs = []
        for n, (i, j) in enumerate(minibatch_indices):
            train_cost = train(X[:, i:j], X_mask[:, i:j],
                               y[:, i:j], y_mask[:, i:j])
            costs.append(train_cost)
            if e % 250 == 0 or e == (n_epochs - 1):
                print("Iteration %i, mb %i:" % (e, n))
                mean_cost = np.mean(costs)
                print(mean_cost)
                ret = pred(X[:, i:j], X_mask[:, i:j],
                           y[:, i:j], y_mask[:, i:j])
                pr, kappa, alpha, beta, phi, mixture, window = ret
                plot_items(mean_cost, ret, "forced", i)
    print("Final cost")
    mean_cost = np.mean(costs)
    print(mean_cost)
    for n, (i, j) in enumerate(minibatch_indices):
        pred_X = np.zeros_like(X[:, i:j])
        start_slice = 2
        pred_X[:start_slice] = X[:start_slice, i:j]
        sub_y = np.zeros_like(y[:, i:j])
        sub_y[:] = y[:, i:j]
        sub_y_mask = np.zeros_like(y_mask[:, i:j])
        sub_y_mask[:] = y_mask[:, i:j]
        # use an all-ones mask since at generation time we don't know
        # the true length
        sub_X_mask = np.ones_like(X_mask[:, i:j])
        for ii in np.arange(start_slice, len(X) - 1):
            ret = pred(pred_X[:ii], sub_X_mask[:ii],
                       sub_y, sub_y_mask)
            pr, kappa, alpha, beta, phi, mixture, window = ret
            # binarize the sigmoid outputs and feed them back in
            pr[pr < 0.5] = 0
            pr[pr >= 0.5] = 1
            pred_X[ii + 1, :] = pr[-1]
        ret = pred(pred_X, sub_X_mask, sub_y, sub_y_mask)
        plot_items(mean_cost, ret, "self", i)
    from IPython import embed
    embed()
    raise ValueError()