theano
import numpy as np
import theano as th
import theano.tensor as T
import lasagne

# T.nnet.relu has some issues with very large inputs; this is more stable.
# Defined before the layer classes below, which use it as a default argument.
def relu(x):
    return T.maximum(x, 0)

class BatchNormLayer(lasagne.layers.Layer):
    def __init__(self, incoming, b=lasagne.init.Constant(0.), g=lasagne.init.Constant(1.),
                 W=lasagne.init.Normal(0.05), nonlinearity=relu, **kwargs):
        super(BatchNormLayer, self).__init__(incoming, **kwargs)
        self.nonlinearity = nonlinearity
        k = self.input_shape[1]
        if b is not None:
            self.b = self.add_param(b, (k,), name="b", regularizable=False)
        if g is not None:
            self.g = self.add_param(g, (k,), name="g")
        # running averages of the batch statistics, used at test time
        self.avg_batch_mean = self.add_param(lasagne.init.Constant(0.), (k,), name="avg_batch_mean",
                                             regularizable=False, trainable=False)
        self.avg_batch_var = self.add_param(lasagne.init.Constant(1.), (k,), name="avg_batch_var",
                                            regularizable=False, trainable=False)
        # re-initialize the weights of the layer below
        incoming.W.set_value(W.sample(incoming.W.get_value().shape))
        if len(self.input_shape) == 4:
            self.axes_to_sum = (0, 2, 3)
            self.dimshuffle_args = ['x', 0, 'x', 'x']
        else:
            self.axes_to_sum = 0
            self.dimshuffle_args = ['x', 0]

    def get_output_for(self, input, deterministic=False, **kwargs):
        if deterministic:
            norm_features = (input - self.avg_batch_mean.dimshuffle(*self.dimshuffle_args)) \
                / T.sqrt(1e-6 + self.avg_batch_var).dimshuffle(*self.dimshuffle_args)
        else:
            batch_mean = T.mean(input, axis=self.axes_to_sum).flatten()
            centered_input = input - batch_mean.dimshuffle(*self.dimshuffle_args)
            batch_var = T.mean(T.square(centered_input), axis=self.axes_to_sum).flatten()
            batch_stdv = T.sqrt(1e-6 + batch_var)
            norm_features = centered_input / batch_stdv.dimshuffle(*self.dimshuffle_args)
            # BN updates: exponential moving averages of the batch statistics.
            # These are not applied automatically; collect them and pass them to
            # theano.function(..., updates=...) yourself.
            new_m = 0.9 * self.avg_batch_mean + 0.1 * batch_mean
            new_v = 0.9 * self.avg_batch_var \
                + T.cast((0.1 * input.shape[0]) / (input.shape[0] - 1.), th.config.floatX) * batch_var
            self.bn_updates = [(self.avg_batch_mean, new_m), (self.avg_batch_var, new_v)]
        if hasattr(self, 'g'):
            activation = norm_features * self.g.dimshuffle(*self.dimshuffle_args)
        else:
            activation = norm_features
        if hasattr(self, 'b'):
            activation += self.b.dimshuffle(*self.dimshuffle_args)
        return self.nonlinearity(activation)

def batch_norm(layer, b=lasagne.init.Constant(0.), g=lasagne.init.Constant(1.), **kwargs):
    """
    Adapted from https://gist.github.com/f0k/f1a6bd3c8585c400c190
    Convenience function to apply batch normalization to a given layer's output.
    Will steal the layer's nonlinearity if there is one (effectively introducing
    the normalization right before the nonlinearity), and will remove the
    layer's bias if there is one (because it would be redundant).
    """
    nonlinearity = getattr(layer, 'nonlinearity', None)
    if nonlinearity is not None:
        layer.nonlinearity = lasagne.nonlinearities.identity
    if hasattr(layer, 'b'):
        del layer.params[layer.b]
        layer.b = None
    return BatchNormLayer(layer, b, g, nonlinearity=nonlinearity, **kwargs)

class WeightNormLayer(lasagne.layers.Layer):
    def __init__(self, incoming, b=lasagne.init.Constant(0.), g=lasagne.init.Constant(1.),
                 W=lasagne.init.Normal(0.05), nonlinearity=relu, **kwargs):
        super(WeightNormLayer, self).__init__(incoming, **kwargs)
        self.nonlinearity = nonlinearity
        k = self.input_shape[1]
        if b is not None:
            self.b = self.add_param(b, (k,), name="b", regularizable=False)
        if g is not None:
            self.g = self.add_param(g, (k,), name="g")
        if len(self.input_shape) == 4:
            self.axes_to_sum = (0, 2, 3)
            self.dimshuffle_args = ['x', 0, 'x', 'x']
        else:
            self.axes_to_sum = 0
            self.dimshuffle_args = ['x', 0]
        # normalize the weights of the layer below, scaled by the learned gain g
        incoming.W_param = incoming.W
        incoming.W_param.set_value(W.sample(incoming.W_param.get_value().shape))
        if incoming.W_param.ndim == 4:
            W_axes_to_sum = (1, 2, 3)
            W_dimshuffle_args = [0, 'x', 'x', 'x']
        else:
            W_axes_to_sum = 0
            W_dimshuffle_args = ['x', 0]
        if g is not None:
            incoming.W = incoming.W_param * (self.g / T.sqrt(T.sum(T.square(incoming.W_param),
                                             axis=W_axes_to_sum))).dimshuffle(*W_dimshuffle_args)
        else:
            incoming.W = incoming.W_param / T.sqrt(T.sum(T.square(incoming.W_param),
                                                         axis=W_axes_to_sum, keepdims=True))

    def get_output_for(self, input, init=False, **kwargs):
        if init:
            # data-dependent initialization: normalize the pre-activations and
            # record updates that set b and g accordingly
            m = T.mean(input, self.axes_to_sum)
            input -= m.dimshuffle(*self.dimshuffle_args)
            stdv = T.sqrt(T.mean(T.square(input), axis=self.axes_to_sum))
            input /= stdv.dimshuffle(*self.dimshuffle_args)
            self.init_updates = [(self.b, -m / stdv), (self.g, self.g / stdv)]
        elif hasattr(self, 'b'):
            input += self.b.dimshuffle(*self.dimshuffle_args)
        return self.nonlinearity(input)

def weight_norm(layer, **kwargs):
    nonlinearity = getattr(layer, 'nonlinearity', None)
    if nonlinearity is not None:
        layer.nonlinearity = lasagne.nonlinearities.identity
    if hasattr(layer, 'b'):
        del layer.params[layer.b]
        layer.b = None
    return WeightNormLayer(layer, nonlinearity=nonlinearity, **kwargs)
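A minimal usage sketch (my addition, with assumed shapes): wrap a Lasagne layer with batch_norm, then collect the wrapped layer's bn_updates and pass them to theano.function alongside the optimizer updates, since nothing applies them automatically.

import theano
import lasagne.layers as ll

l_in = ll.InputLayer((None, 3, 32, 32))
l_bn = batch_norm(ll.Conv2DLayer(l_in, num_filters=16, filter_size=3, pad='same'))
out = ll.get_output(l_bn, deterministic=False)  # building the graph sets l_bn.bn_updates
# train_fn = theano.function([...], cost, updates=optimizer_updates + l_bn.bn_updates)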
# from https://github.com/jostosh/theano_utils/blob/master/lcn.py
import numpy
import theano
import theano.tensor as T
from theano.tensor.nnet import conv

class LecunLCN(object):
    def __init__(self, X, image_shape, threshold=1e-4, radius=9, use_divisor=True):
        """
        Allocate an LCN.

        :type X: theano.tensor.dtensor4
        :param X: symbolic image tensor, of shape image_shape

        :type image_shape: tuple or list of length 4
        :param image_shape: (batch size, num input feature maps,
                             image height, image width)

        :type threshold: double
        :param threshold: the threshold will be used to avoid division by zeros

        :type radius: int
        :param radius: determines size of Gaussian filter patch (default 9x9)

        :type use_divisor: Boolean
        :param use_divisor: whether or not to apply divisive normalization
        """
        # Get Gaussian filter
        filter_shape = (1, image_shape[1], radius, radius)
        self.filters = theano.shared(self.gaussian_filter(filter_shape), borrow=True)

        # Compute the Gaussian weighted average by means of convolution
        convout = conv.conv2d(
            input=X,
            filters=self.filters,
            image_shape=image_shape,
            filter_shape=filter_shape,
            border_mode='full'
        )

        # Subtractive step
        mid = int(numpy.floor(filter_shape[2] / 2.))

        # Make filter dimension broadcastable and subtract
        centered_X = X - T.addbroadcast(convout[:, :, mid:-mid, mid:-mid], 1)

        # Boolean marks whether or not to perform divisive step
        if use_divisor:
            # Note that the local variances can be computed by using the centered_X
            # tensor. If we convolve this with the mean filter, that should give us
            # the variance at each point. We simply take the square root to get our
            # denominator

            # Compute variances
            sum_sqr_XX = conv.conv2d(
                input=T.sqr(centered_X),
                filters=self.filters,
                image_shape=image_shape,
                filter_shape=filter_shape,
                border_mode='full'
            )

            # Take square root to get local standard deviation
            denom = T.sqrt(sum_sqr_XX[:, :, mid:-mid, mid:-mid])

            per_img_mean = denom.mean(axis=[2, 3])
            divisor = T.largest(per_img_mean.dimshuffle(0, 1, 'x', 'x'), denom)
            # Divisive step
            new_X = centered_X / T.maximum(T.addbroadcast(divisor, 1), threshold)
        else:
            new_X = centered_X

        self.output = new_X

    def gaussian_filter(self, kernel_shape):
        x = numpy.zeros(kernel_shape, dtype=theano.config.floatX)

        def gauss(x, y, sigma=2.0):
            Z = 2 * numpy.pi * sigma ** 2
            return 1. / Z * numpy.exp(-(x ** 2 + y ** 2) / (2. * sigma ** 2))

        mid = numpy.floor(kernel_shape[-1] / 2.)
        for kernel_idx in xrange(0, kernel_shape[1]):
            for i in xrange(0, kernel_shape[2]):
                for j in xrange(0, kernel_shape[3]):
                    x[0, kernel_idx, i, j] = gauss(i - mid, j - mid)

        return x / numpy.sum(x)
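A small usage sketch (my addition; the batch and image sizes are arbitrary): build the LCN graph for a batch of RGB images and compile it.

import numpy as np

X = T.tensor4('X')
image_shape = (16, 3, 32, 32)  # (batch, channels, height, width) -- assumed values
lcn = LecunLCN(X, image_shape)
f = theano.function([X], lcn.output)
out = f(np.random.rand(*image_shape).astype(theano.config.floatX))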
import theano
import theano.tensor as T

def get_klloss(n, margin, prediction, target_var):
    '''
    Implementation of the paper 'Neural network-based clustering using pairwise constraints'.

    params:
        n: batch size
        margin: margin for the KL divergence
        prediction: softmax output
        target_var: ground-truth label vector
    '''
    log_prediction = T.log(prediction)
    prediction = prediction.reshape((n, 1, -1))
    logpred1 = log_prediction.reshape((n, 1, -1))
    logpred2 = log_prediction.reshape((1, n, -1))
    logpred_m = logpred1 - logpred2
    # pairwise divergences: kl[i, j] = KL(p_i || p_j) for every pair in the batch
    kl = prediction * logpred_m
    kl = T.sum(kl, 2)
    id1 = target_var.reshape((-1, 1))
    id2 = target_var.reshape((1, -1))
    sim = id1 - id2
    # for pairs with different labels, apply the hinge max(margin - KL, 0)
    idxes = T.neq(sim, 0).nonzero()
    kl = T.set_subtensor(kl[idxes], T.maximum(margin - kl[idxes], 0))
    return T.mean(kl)
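A hypothetical call sketch (names and sizes assumed): prediction is the softmax output of a network and target_var holds integer labels; same-label pairs are pulled together (small KL) while different-label pairs are pushed apart up to the margin.

import theano.tensor as T

prediction = T.matrix('prediction')   # (n, num_classes) softmax output
target_var = T.ivector('target_var')  # (n,) integer labels
loss = get_klloss(100, 2.0, prediction, target_var)  # n=100 and margin=2.0 are arbitrary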
def wst_loss(ypred, gt, M, lbda, it_num):
    # Sinkhorn-style Wasserstein loss between the predicted distribution and the
    # one-hot ground truth; assumes a symmetric cost matrix M (so K == K.T)
    n = gt.shape[0]
    k = M.shape[0]
    ylabel = T.zeros((n, k))
    ylabel = T.set_subtensor(ylabel[T.arange(n), gt], 1)
    u = T.ones((n, k), dtype='float32')
    v = T.ones((n, k), dtype='float32')
    K = T.exp(-lbda * M - 1)
    KM = K * M

    def fn(u, v, ypred, ylabel, K):
        # alternating Sinkhorn scaling updates
        v = ylabel / T.dot(u, K)
        u = ypred / T.dot(v, K)
        return u, v
    #def fn(u, ypred, ylabel, K):
    #    u = ypred / T.dot((ylabel / T.dot(u, K)), K)
    #    return u

    results, _ = theano.scan(fn, outputs_info=[u, v],
                             non_sequences=[ypred, ylabel, K], n_steps=it_num)
    u = results[0][-1]
    v = results[1][-1]
    #u = results[-1]
    #v = ylabel / T.dot(u, K)
    loss = T.sum(v * T.dot(u, KM))

    # entropy term
    epsilon = 1e-7
    u = T.clip(u, epsilon, 1 - epsilon)
    v = T.clip(v, epsilon, 1 - epsilon)
    loss += 1 / lbda * T.sum(T.dot(u * T.log(u), K) * v)
    loss += 1 / lbda * T.sum(T.dot(v * T.log(v), K) * u)
    loss += 1 / lbda * T.sum(T.dot(u, K * T.log(K)) * v)
    return loss / n
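A usage sketch (my addition; the toy cost matrix and hyperparameters are arbitrary): M is the k-by-k ground cost between classes, lbda the entropic regularization strength, and it_num the number of Sinkhorn iterations.

import numpy as np
import theano
import theano.tensor as T

ypred = T.fmatrix('ypred')  # (n, k) softmax output
gt = T.ivector('gt')        # (n,) integer labels
# toy symmetric ground cost: |i - j| between class indices
M_np = np.abs(np.arange(10)[:, None] - np.arange(10)[None, :]).astype('float32')
M = theano.shared(M_np)
loss = wst_loss(ypred, gt, M, lbda=10.0, it_num=20)  # lbda and it_num are arbitrary here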
from multiprocessing import Pool
import sys
sys.path.append('/media/dl/data2/cifar10_resnet/')
from utils import *

def gg(aa, bb):
    # theano must be imported inside the worker process;
    # importing it in the parent caused a MemoryError here
    import theano
    import theano.tensor as T
    a = T.scalar(name='a', dtype='float32')
    b = T.scalar(name='b', dtype='float32')
    c = a + b
    f = theano.function([a, b], c)
    return f(aa, bb)

if __name__ == '__main__':
    pool = Pool(processes=2)
    pool.apply_async(gg, (1.0, 2.0))
    #r = pool.apply_async(train_cls, (data, cls_target_var, cls_train_fn, cls_val_fn))
    #kl_label = pool.apply_async(model_kl, (data,), kwargs)
    pool.close()
    pool.join()
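A small follow-up sketch (my addition): apply_async returns an AsyncResult, so to actually retrieve the computed sum you would keep the handle and call get() after the pool finishes.

if __name__ == '__main__':
    pool = Pool(processes=2)
    res = pool.apply_async(gg, (1.0, 2.0))
    pool.close()
    pool.join()
    print(res.get())  # 3.0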
import numpy as np
import theano as th
import theano.tensor as T

def adamax_updates(params, cost, lr=0.001, mom1=0.9, mom2=0.999):
    updates = []
    grads = T.grad(cost, params)
    for p, g in zip(params, grads):
        mg = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))  # infinity-norm accumulator
        v = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))   # first-moment accumulator
        if mom1 > 0:
            v_t = mom1 * v + (1. - mom1) * g
            updates.append((v, v_t))
        else:
            v_t = g
        mg_t = T.maximum(mom2 * mg, abs(g))
        g_t = v_t / (mg_t + 1e-6)
        p_t = p - lr * g_t
        updates.append((mg, mg_t))
        updates.append((p, p_t))
    return updates

def adam_updates(params, cost, lr=0.001, mom1=0.9, mom2=0.999):
    updates = []
    grads = T.grad(cost, params)
    t = th.shared(np.cast[th.config.floatX](1.))
    for p, g in zip(params, grads):
        v = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))   # first moment
        mg = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))  # second moment
        v_t = mom1 * v + (1. - mom1) * g
        mg_t = mom2 * mg + (1. - mom2) * T.square(g)
        v_hat = v_t / (1. - mom1 ** t)   # bias correction
        mg_hat = mg_t / (1. - mom2 ** t)
        g_t = v_hat / T.sqrt(mg_hat + 1e-8)
        p_t = p - lr * g_t
        updates.append((v, v_t))
        updates.append((mg, mg_t))
        updates.append((p, p_t))
    # the step counter must be updated once, outside the per-parameter loop
    # (duplicate updates for the same shared variable are rejected by Theano)
    updates.append((t, t + 1))
    return updates
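A minimal end-to-end sketch (my own toy logistic regression, not part of the gist) showing how the update lists plug into theano.function:

import numpy as np
import theano as th
import theano.tensor as T

W = th.shared(np.zeros((5, 3), dtype=th.config.floatX))
x = T.matrix('x')
y = T.ivector('y')
cost = T.nnet.categorical_crossentropy(T.nnet.softmax(T.dot(x, W)), y).mean()
train_fn = th.function([x, y], cost, updates=adam_updates([W], cost, lr=1e-2))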
import theano
import theano.tensor as T

X = T.fmatrix('X')
Y = T.fmatrix('Y')
P = T.scalar('P')

# pairwise Minkowski distances between the rows of X and the rows of Y
translation_vectors = X.reshape((X.shape[0], 1, -1)) - Y.reshape((1, Y.shape[0], -1))
minkowski_distances = (abs(translation_vectors) ** P).sum(2) ** (1. / P)

f_minkowski = theano.function([X, Y, P], minkowski_distances)
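A quick usage check (my addition, with arbitrary shapes): P=2 recovers Euclidean distances.

import numpy as np

a = np.random.rand(4, 3).astype('float32')
b = np.random.rand(5, 3).astype('float32')
print(f_minkowski(a, b, np.float32(2.0)))  # shape (4, 5)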
import numpy as np
import theano as th
import theano.tensor as T
from scipy import linalg

class ZCA(object):
    def __init__(self, regularization=1e-5, x=None):
        self.regularization = regularization
        if x is not None:
            self.fit(x)

    def fit(self, x):
        s = x.shape
        x = x.copy().reshape((s[0], np.prod(s[1:])))
        m = np.mean(x, axis=0)
        x -= m
        sigma = np.dot(x.T, x) / x.shape[0]
        U, S, V = linalg.svd(sigma)
        tmp = np.dot(U, np.diag(1. / np.sqrt(S + self.regularization)))
        tmp2 = np.dot(U, np.diag(np.sqrt(S + self.regularization)))
        self.ZCA_mat = th.shared(np.dot(tmp, U.T).astype(th.config.floatX))
        self.inv_ZCA_mat = th.shared(np.dot(tmp2, U.T).astype(th.config.floatX))
        self.mean = th.shared(m.astype(th.config.floatX))

    def apply(self, x):
        s = x.shape
        if isinstance(x, np.ndarray):
            return np.dot(x.reshape((s[0], np.prod(s[1:]))) - self.mean.get_value(),
                          self.ZCA_mat.get_value()).reshape(s)
        elif isinstance(x, T.TensorVariable):
            return T.dot(x.flatten(2) - self.mean.dimshuffle('x', 0), self.ZCA_mat).reshape(s)
        else:
            raise NotImplementedError("Whitening only implemented for numpy arrays or Theano TensorVariables")

    def invert(self, x):
        s = x.shape
        if isinstance(x, np.ndarray):
            return (np.dot(x.reshape((s[0], np.prod(s[1:]))), self.inv_ZCA_mat.get_value())
                    + self.mean.get_value()).reshape(s)
        elif isinstance(x, T.TensorVariable):
            return (T.dot(x.flatten(2), self.inv_ZCA_mat) + self.mean.dimshuffle('x', 0)).reshape(s)
        else:
            raise NotImplementedError("Whitening only implemented for numpy arrays or Theano TensorVariables")
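A round-trip usage sketch (my addition, random toy data): fit the whitener, whiten, and invert; the inversion should approximately recover the input.

import numpy as np

train_x = np.random.rand(100, 3, 8, 8).astype('float32')
whitener = ZCA(x=train_x)
white = whitener.apply(train_x)
back = whitener.invert(white)
print(np.allclose(back, train_x, atol=1e-3))  # approximately True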
# -*- coding: utf-8 -*-
import theano
import theano.tensor as T
import numpy as np
from theano import OrderedUpdates

#a = T.ivector('a')
#b = theano.shared(np.array([1,2]), 'int32')
#loss = 0
#c = theano.shared(1)
#for i in np.arange(2):
#    bn = T.set_subtensor(b[i], b[i]**2)
#    loss += b[i] * a[i]
#f = theano.function([a], [b, c, loss], updates=((c, c+1), (b, bn)))
#av = np.array([1,2]).astype('int32')
#print f(av)
#print f(av)

############################################################################
# method 1: return an updates dictionary from the step function, so that
# theano.scan collects the update on the shared variable `a`
a = theano.shared(np.array([[1,2], [3,4]], dtype='float32'))
b = T.fscalar('b')

def loss(i, p):
    an = T.set_subtensor(a[i], a[i]**2)
    p += T.sum(an[i])
    #return (p, OrderedUpdates([(a, an)]))
    return (p, {a: an})

results, updates = theano.scan(fn=loss, sequences=T.arange(2), outputs_info=b)
fun = theano.function([b], [a, results[-1]], updates=updates)
print fun(1)
print fun(1)

############################################################################
# method 2: thread `a` through outputs_info as a recurrent output and install
# the final value as an update on the compiled function
a = theano.shared(np.array([[1,2], [3,4]], dtype='float32'))
b = T.fscalar('b')

def loss(i, p, a):
    an = T.set_subtensor(a[i], a[i]**2)
    p += T.sum(an[i])
    #return (p, OrderedUpdates([(a, an)]))
    return (p, an)

results, updates = theano.scan(fn=loss, sequences=T.arange(2), outputs_info=[b, a])
p = results[0][-1]
updates = {a: results[1][-1]}
fun = theano.function([b], [a, p], updates=updates)
print fun(1)
print fun(1)
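A note on the two snippets above: both square the rows of the shared variable a inside the scan loop. The first returns an updates dictionary from the step function, letting theano.scan collect the update on a; the second treats a as a recurrent output and installs its final value as an update on the compiled function. Either way, a only changes when fun runs its updates, which is why calling fun(1) twice prints different values.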
# -*- coding: utf-8 -*-
import theano
import theano.tensor as T
import numpy as np

#x, y = T.matrices('xy')
#
## regular softmax and crossentropy
##sm = T.nnet.softmax(x)
#
## The problem goes away if you explicitly write out the softmax function instead of using Theano's:
#sm = T.exp(x) / (T.exp(x).sum(1, keepdims=True))
#
#cm1 = T.nnet.categorical_crossentropy(sm, y)
#g1 = T.grad(cm1.mean(), x)
#
## numerically stable log-softmax with crossentropy
#xdev = x - x.max(1, keepdims=True)
#lsm = xdev - T.log(T.sum(T.exp(xdev), axis=1, keepdims=True))
#sm2 = T.exp(lsm) # just used to show equivalence with sm
#cm2 = -T.sum(y*lsm, axis=1)
#g2 = T.grad(cm2.mean(), x)
#
## create some inputs into a softmax that are large, plus labels
#a = np.exp(10*np.random.rand(5, 10).astype(theano.config.floatX))
## create some one-hot coded labels
#b = np.eye(5, 10).astype(theano.config.floatX)

###############################################################################
# There is an optimization that stabilizes the expression, but it seems to get
# triggered only if the target y is expressed as a vector of class indices,
# not as a matrix of one-hot vectors.
x = T.matrix('x')
y = T.ivector('y')

# regular softmax and crossentropy
sm = T.nnet.softmax(x)
cm1 = T.nnet.categorical_crossentropy(sm, y)
g1 = T.grad(cm1.mean(), x)

# numerically stable log-softmax with crossentropy
xdev = x - x.max(1, keepdims=True)
lsm = xdev - T.log(T.sum(T.exp(xdev), axis=1, keepdims=True))
sm2 = T.exp(lsm) # just used to show equivalence with sm
cm2 = -lsm[T.arange(y.shape[0]), y]
g2 = T.grad(cm2.mean(), x)

# create some inputs into a softmax that are large
a = np.exp(10*np.random.rand(5, 10).astype(theano.config.floatX))
# create some integer class labels
b = np.random.randint(0, 10, 5).astype(np.uint8)

###############################################################################
# show equivalence of softmax and exponentiated numerically stable log-softmax
f1 = theano.function([x], [sm, sm2])
sm1, sm2 = f1(a)
print np.allclose(sm1, sm2)

# now show that the two versions result in the same crossentropy cost;
# this indicates that the forward pass does provide some numerical stability
f2 = theano.function([x, y], [cm1, cm2])
c1, c2 = f2(a, b)
print np.allclose(c1, c2)

# now show that in the standard softmax case the gradients blow up
# while in the log-softmax case they don't
f3 = theano.function([x, y], [g1, g2])
g1_, g2_ = f3(a, b)
print g1_
print g2_
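For reference, the stabilization used above is the standard log-sum-exp shift: with m = max_j x_j, log softmax(x)_i = (x_i - m) - log(sum_j exp(x_j - m)). Subtracting the row maximum leaves the result mathematically unchanged but keeps every exponent non-positive, so the forward pass cannot overflow and the gradient stays finite even for inputs as large as the ones generated here.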