theano: assorted Theano/Lasagne snippets by @jiqiujia (last active February 15, 2017)
import numpy as np
import theano as th
import theano.tensor as T
import lasagne


# T.nnet.relu has some issues with very large inputs, this is more stable
def relu(x):
    return T.maximum(x, 0)


class BatchNormLayer(lasagne.layers.Layer):
    def __init__(self, incoming, b=lasagne.init.Constant(0.), g=lasagne.init.Constant(1.),
                 W=lasagne.init.Normal(0.05), nonlinearity=relu, **kwargs):
        super(BatchNormLayer, self).__init__(incoming, **kwargs)
        self.nonlinearity = nonlinearity
        k = self.input_shape[1]
        if b is not None:
            self.b = self.add_param(b, (k,), name="b", regularizable=False)
        if g is not None:
            self.g = self.add_param(g, (k,), name="g")
        self.avg_batch_mean = self.add_param(lasagne.init.Constant(0.), (k,), name="avg_batch_mean",
                                             regularizable=False, trainable=False)
        self.avg_batch_var = self.add_param(lasagne.init.Constant(1.), (k,), name="avg_batch_var",
                                            regularizable=False, trainable=False)
        incoming.W.set_value(W.sample(incoming.W.get_value().shape))
        if len(self.input_shape) == 4:
            self.axes_to_sum = (0, 2, 3)
            self.dimshuffle_args = ['x', 0, 'x', 'x']
        else:
            self.axes_to_sum = 0
            self.dimshuffle_args = ['x', 0]

    def get_output_for(self, input, deterministic=False, **kwargs):
        if deterministic:
            norm_features = ((input - self.avg_batch_mean.dimshuffle(*self.dimshuffle_args))
                             / T.sqrt(1e-6 + self.avg_batch_var).dimshuffle(*self.dimshuffle_args))
        else:
            batch_mean = T.mean(input, axis=self.axes_to_sum).flatten()
            centered_input = input - batch_mean.dimshuffle(*self.dimshuffle_args)
            batch_var = T.mean(T.square(centered_input), axis=self.axes_to_sum).flatten()
            batch_stdv = T.sqrt(1e-6 + batch_var)
            norm_features = centered_input / batch_stdv.dimshuffle(*self.dimshuffle_args)
            # BN updates: exponential moving averages of the batch statistics;
            # they must be collected by the caller and passed to theano.function
            new_m = 0.9*self.avg_batch_mean + 0.1*batch_mean
            new_v = (0.9*self.avg_batch_var
                     + T.cast((0.1*input.shape[0])/(input.shape[0]-1.), th.config.floatX)*batch_var)
            self.bn_updates = [(self.avg_batch_mean, new_m), (self.avg_batch_var, new_v)]
        if hasattr(self, 'g'):
            activation = norm_features*self.g.dimshuffle(*self.dimshuffle_args)
        else:
            activation = norm_features
        if hasattr(self, 'b'):
            activation += self.b.dimshuffle(*self.dimshuffle_args)
        return self.nonlinearity(activation)
def batch_norm(layer, b=lasagne.init.Constant(0.), g=lasagne.init.Constant(1.), **kwargs):
    """
    adapted from https://gist.github.com/f0k/f1a6bd3c8585c400c190
    Convenience function to apply batch normalization to a given layer's output.
    Will steal the layer's nonlinearity if there is one (effectively introducing
    the normalization right before the nonlinearity), and will remove the
    layer's bias if there is one (because it would be redundant).
    """
    nonlinearity = getattr(layer, 'nonlinearity', None)
    if nonlinearity is not None:
        layer.nonlinearity = lasagne.nonlinearities.identity
    if hasattr(layer, 'b'):
        del layer.params[layer.b]
        layer.b = None
    return BatchNormLayer(layer, b, g, nonlinearity=nonlinearity, **kwargs)
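
# Usage sketch (not part of the original gist; the layer sizes and variable
# names below are made up for illustration). batch_norm() wraps a Lasagne
# layer, stealing its nonlinearity and bias; after a non-deterministic forward
# pass the BatchNormLayer exposes bn_updates, which must be passed to
# theano.function explicitly so the running mean/variance get updated.
x_sym = T.matrix('x_sym')
l_in_bn = lasagne.layers.InputLayer((None, 100), input_var=x_sym)
l_bn = batch_norm(lasagne.layers.DenseLayer(l_in_bn, num_units=50, nonlinearity=relu))
train_out = lasagne.layers.get_output(l_bn, deterministic=False)
train_fn_bn = th.function([x_sym], train_out, updates=getattr(l_bn, 'bn_updates', []))
test_out = lasagne.layers.get_output(l_bn, deterministic=True)  # uses the running averages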
class WeightNormLayer(lasagne.layers.Layer):
    def __init__(self, incoming, b=lasagne.init.Constant(0.), g=lasagne.init.Constant(1.),
                 W=lasagne.init.Normal(0.05), nonlinearity=relu, **kwargs):
        super(WeightNormLayer, self).__init__(incoming, **kwargs)
        self.nonlinearity = nonlinearity
        k = self.input_shape[1]
        if b is not None:
            self.b = self.add_param(b, (k,), name="b", regularizable=False)
        if g is not None:
            self.g = self.add_param(g, (k,), name="g")
        if len(self.input_shape) == 4:
            self.axes_to_sum = (0, 2, 3)
            self.dimshuffle_args = ['x', 0, 'x', 'x']
        else:
            self.axes_to_sum = 0
            self.dimshuffle_args = ['x', 0]
        # scale weights in layer below
        incoming.W_param = incoming.W
        incoming.W_param.set_value(W.sample(incoming.W_param.get_value().shape))
        if incoming.W_param.ndim == 4:
            W_axes_to_sum = (1, 2, 3)
            W_dimshuffle_args = [0, 'x', 'x', 'x']
        else:
            W_axes_to_sum = 0
            W_dimshuffle_args = ['x', 0]
        if g is not None:
            incoming.W = incoming.W_param * (self.g / T.sqrt(T.sum(T.square(incoming.W_param),
                                             axis=W_axes_to_sum))).dimshuffle(*W_dimshuffle_args)
        else:
            incoming.W = incoming.W_param / T.sqrt(T.sum(T.square(incoming.W_param),
                                                         axis=W_axes_to_sum, keepdims=True))

    def get_output_for(self, input, init=False, **kwargs):
        if init:
            m = T.mean(input, self.axes_to_sum)
            input -= m.dimshuffle(*self.dimshuffle_args)
            stdv = T.sqrt(T.mean(T.square(input), axis=self.axes_to_sum))
            input /= stdv.dimshuffle(*self.dimshuffle_args)
            # data-dependent initialization of `b` and `g`
            self.init_updates = [(self.b, -m/stdv), (self.g, self.g/stdv)]
        elif hasattr(self, 'b'):
            input += self.b.dimshuffle(*self.dimshuffle_args)
        return self.nonlinearity(input)


def weight_norm(layer, **kwargs):
    nonlinearity = getattr(layer, 'nonlinearity', None)
    if nonlinearity is not None:
        layer.nonlinearity = lasagne.nonlinearities.identity
    if hasattr(layer, 'b'):
        del layer.params[layer.b]
        layer.b = None
    return WeightNormLayer(layer, nonlinearity=nonlinearity, **kwargs)
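
# Usage sketch (not part of the original gist; sizes and names are illustrative).
# weight_norm() wraps a layer whose W it reparameterizes; the data-dependent
# initialization of b and g is done by requesting the output with init=True and
# applying the resulting init_updates once on a batch of real data.
x_wn = T.matrix('x_wn')
l_in_wn = lasagne.layers.InputLayer((None, 100), input_var=x_wn)
l_wn = weight_norm(lasagne.layers.DenseLayer(l_in_wn, num_units=50, nonlinearity=relu))
init_out = lasagne.layers.get_output(l_wn, init=True)
init_fn = th.function([x_wn], init_out, updates=getattr(l_wn, 'init_updates', []))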
# from https://github.com/jostosh/theano_utils/blob/master/lcn.py
import numpy
import theano
from theano.tensor.nnet import conv


class LecunLCN(object):
    def __init__(self, X, image_shape, threshold=1e-4, radius=9, use_divisor=True):
        """
        Allocate an LCN.

        :type X: theano.tensor.dtensor4
        :param X: symbolic image tensor, of shape image_shape

        :type image_shape: tuple or list of length 4
        :param image_shape: (batch size, num input feature maps,
                             image height, image width)

        :type threshold: double
        :param threshold: the threshold will be used to avoid division by zero

        :type radius: int
        :param radius: determines size of Gaussian filter patch (default 9x9)

        :type use_divisor: Boolean
        :param use_divisor: whether or not to apply divisive normalization
        """
        # Get Gaussian filter
        filter_shape = (1, image_shape[1], radius, radius)
        self.filters = theano.shared(self.gaussian_filter(filter_shape), borrow=True)

        # Compute the Gaussian weighted average by means of convolution
        convout = conv.conv2d(
            input=X,
            filters=self.filters,
            image_shape=image_shape,
            filter_shape=filter_shape,
            border_mode='full'
        )

        # Subtractive step
        mid = int(numpy.floor(filter_shape[2] / 2.))

        # Make filter dimension broadcastable and subtract
        centered_X = X - T.addbroadcast(convout[:, :, mid:-mid, mid:-mid], 1)

        # Boolean marks whether or not to perform divisive step
        if use_divisor:
            # Note that the local variances can be computed by using the centered_X
            # tensor. If we convolve this with the mean filter, that should give us
            # the variance at each point. We simply take the square root to get our
            # denominator

            # Compute variances
            sum_sqr_XX = conv.conv2d(
                input=T.sqr(centered_X),
                filters=self.filters,
                image_shape=image_shape,
                filter_shape=filter_shape,
                border_mode='full'
            )

            # Take square root to get local standard deviation
            denom = T.sqrt(sum_sqr_XX[:, :, mid:-mid, mid:-mid])
            per_img_mean = denom.mean(axis=[2, 3])
            divisor = T.largest(per_img_mean.dimshuffle(0, 1, 'x', 'x'), denom)
            # Divisive step
            new_X = centered_X / T.maximum(T.addbroadcast(divisor, 1), threshold)
        else:
            new_X = centered_X

        self.output = new_X

    def gaussian_filter(self, kernel_shape):
        x = numpy.zeros(kernel_shape, dtype=theano.config.floatX)

        def gauss(x, y, sigma=2.0):
            Z = 2 * numpy.pi * sigma ** 2
            return 1. / Z * numpy.exp(-(x ** 2 + y ** 2) / (2. * sigma ** 2))

        mid = numpy.floor(kernel_shape[-1] / 2.)
        for kernel_idx in xrange(0, kernel_shape[1]):
            for i in xrange(0, kernel_shape[2]):
                for j in xrange(0, kernel_shape[3]):
                    x[0, kernel_idx, i, j] = gauss(i - mid, j - mid)

        return x / numpy.sum(x)
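
# Usage sketch (not part of the original gist; the 32x32 RGB shape and names
# are illustrative). LecunLCN builds the normalized graph at construction time
# and exposes it as .output. Note that image_shape fixes the batch size, so
# lcn_fn must be called with batches of exactly that size.
X_lcn = T.tensor4('X_lcn')
lcn = LecunLCN(X_lcn, image_shape=(128, 3, 32, 32), radius=9)
lcn_fn = theano.function([X_lcn], lcn.output)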
def get_klloss(n, margin, prediction, target_var):
    '''
    Pairwise KL-divergence loss from the paper
    'Neural network-based clustering using pairwise constraints'.

    params:
        n: batch size
        margin: margin for the KL divergence on pairs with different labels
        prediction: softmax output, shape (n, num_classes)
        target_var: ground-truth label vector, shape (n,)
    '''
    log_prediction = T.log(prediction)
    prediction = prediction.reshape((n, 1, -1))
    logpred1 = log_prediction.reshape((n, 1, -1))
    logpred2 = log_prediction.reshape((1, n, -1))
    logpred_m = logpred1 - logpred2
    # kl[i, j] = KL(prediction[i] || prediction[j])
    kl = prediction * logpred_m
    kl = T.sum(kl, 2)
    id1 = target_var.reshape((-1, 1))
    id2 = target_var.reshape((1, -1))
    sim = (id1 - id2)
    # for pairs with different labels, apply a hinge: max(margin - KL, 0)
    idxes = T.neq(sim, 0).nonzero()
    kl = T.set_subtensor(kl[idxes], T.maximum(margin - kl[idxes], 0))
    return T.mean(kl)
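
# Usage sketch (not part of the original gist; the batch size of 128, the
# margin of 2.0 and the variable names are illustrative). n is baked into the
# reshapes, so it must match the runtime batch size.
pred_kl = T.matrix('pred_kl')
labels_kl = T.ivector('labels_kl')
kl_loss = get_klloss(128, 2.0, pred_kl, labels_kl)
kl_fn = theano.function([pred_kl, labels_kl], kl_loss)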
def wst_loss(ypred, gt, M, lbda, it_num):
    '''
    Entropy-regularized Wasserstein (Sinkhorn) loss between the predicted
    distribution ypred and the one-hot encoding of the ground-truth labels gt,
    with ground metric M, regularization strength lbda and it_num Sinkhorn
    iterations.
    '''
    n = gt.shape[0]
    k = M.shape[0]
    ylabel = T.zeros((n, k))
    ylabel = T.set_subtensor(ylabel[T.arange(n), gt], 1)
    u = T.ones((n, k), dtype='float32')
    v = T.ones((n, k), dtype='float32')
    K = T.exp(-lbda * M - 1)
    KM = K * M

    # one Sinkhorn iteration: alternately rescale u and v
    def fn(u, v, ypred, ylabel, K):
        v = ylabel / T.dot(u, K)
        u = ypred / T.dot(v, K)
        return u, v
    #def fn(u, ypred, ylabel, K):
    #    u = ypred / T.dot((ylabel / T.dot(u, K)), K)
    #    return u

    results, _ = theano.scan(fn, outputs_info=[u, v],
                             non_sequences=[ypred, ylabel, K], n_steps=it_num)
    u = results[0][-1]
    v = results[1][-1]
    #u = results[-1]
    #v = ylabel / T.dot(u, K)
    loss = T.sum(v * T.dot(u, KM))
    # entropy term
    epsilon = 1e-7
    u = T.clip(u, epsilon, 1 - epsilon)
    v = T.clip(v, epsilon, 1 - epsilon)
    loss += 1/lbda * T.sum(T.dot(u * T.log(u), K) * v)
    loss += 1/lbda * T.sum(T.dot(v * T.log(v), K) * u)
    loss += 1/lbda * T.sum(T.dot(u, K * T.log(K)) * v)
    return loss / n
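
# Usage sketch (not part of the original gist; the variable names and the
# chosen lbda/iteration count are illustrative). M is a k x k ground-metric
# matrix between classes.
ypred_w = T.fmatrix('ypred_w')
gt_w = T.ivector('gt_w')
M_w = T.fmatrix('M_w')
w_loss = wst_loss(ypred_w, gt_w, M_w, lbda=10.0, it_num=20)
w_fn = theano.function([ypred_w, gt_w, M_w], w_loss)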
from multiprocessing import Pool
import sys
sys.path.append('/media/dl/data2/cifar10_resnet/')
from utils import *


def gg(aa, bb):
    # Theano must be imported inside the worker process,
    # otherwise a MemoryError can occur
    import theano
    import theano.tensor as T
    a = T.scalar(name='a', dtype='float32')
    b = T.scalar(name='b', dtype='float32')
    c = a + b
    f = theano.function([a, b], c)
    return f(aa, bb)


if __name__ == '__main__':
    pool = Pool(processes=2)
    pool.apply_async(gg, (1.0, 2.0))
    #r = pool.apply_async(train_cls, (data, cls_target_var, cls_train_fn, cls_val_fn))
    #kl_label = pool.apply_async(model_kl, (data,), kwargs)
    pool.close()
    pool.join()
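
# Follow-up sketch (not part of the original gist): apply_async returns an
# AsyncResult; calling .get() on it retrieves the worker's return value and
# re-raises any exception raised inside the worker, which makes debugging easier.
if __name__ == '__main__':
    pool2 = Pool(processes=1)
    res = pool2.apply_async(gg, (1.0, 2.0))
    pool2.close()
    pool2.join()
    print res.get()  # prints 3.0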
def adamax_updates(params, cost, lr=0.001, mom1=0.9, mom2=0.999):
    # Adamax: Adam variant based on the infinity norm of the gradients
    updates = []
    grads = T.grad(cost, params)
    for p, g in zip(params, grads):
        mg = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
        v = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
        if mom1 > 0:
            v_t = mom1*v + (1. - mom1)*g
            updates.append((v, v_t))
        else:
            v_t = g
        mg_t = T.maximum(mom2*mg, abs(g))
        g_t = v_t / (mg_t + 1e-6)
        p_t = p - lr * g_t
        updates.append((mg, mg_t))
        updates.append((p, p_t))
    return updates


def adam_updates(params, cost, lr=0.001, mom1=0.9, mom2=0.999):
    # Adam with bias-corrected first and second moment estimates
    updates = []
    grads = T.grad(cost, params)
    t = th.shared(np.cast[th.config.floatX](1.))
    for p, g in zip(params, grads):
        v = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
        mg = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
        v_t = mom1*v + (1. - mom1)*g
        mg_t = mom2*mg + (1. - mom2)*T.square(g)
        v_hat = v_t / (1. - mom1 ** t)
        mg_hat = mg_t / (1. - mom2 ** t)
        g_t = v_hat / T.sqrt(mg_hat + 1e-8)
        p_t = p - lr * g_t
        updates.append((v, v_t))
        updates.append((mg, mg_t))
        updates.append((p, p_t))
    # the shared timestep is updated once per call, outside the parameter loop
    updates.append((t, t+1))
    return updates
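
# Usage sketch (not part of the original gist; the tiny least-squares setup is
# illustrative). adam_updates() returns a list of (shared_variable, update)
# pairs that can be passed straight to theano.function.
w_shared = th.shared(np.zeros(5, dtype=th.config.floatX))
x_adam = T.vector('x_adam')
y_adam = T.scalar('y_adam')
cost_adam = T.square(T.dot(x_adam, w_shared) - y_adam)
train_step = th.function([x_adam, y_adam], cost_adam,
                         updates=adam_updates([w_shared], cost_adam, lr=0.01))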
# pairwise Minkowski distance between the rows of X and the rows of Y
import theano
import theano.tensor as T

X = T.fmatrix('X')
Y = T.fmatrix('Y')
P = T.scalar('P')
translation_vectors = X.reshape((X.shape[0], 1, -1)) - Y.reshape((1, Y.shape[0], -1))
minkowski_distances = (abs(translation_vectors) ** P).sum(2) ** (1. / P)
f_minkowski = theano.function([X, Y, P], minkowski_distances)
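
# Usage sketch (not part of the original gist; random data purely for
# illustration). With P=2 this reduces to pairwise Euclidean distances.
xs = np.random.rand(4, 3).astype('float32')
ys = np.random.rand(5, 3).astype('float32')
dists = f_minkowski(xs, ys, 2.0)  # shape (4, 5)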
# scipy.linalg must be imported at module level: a class-level
# `from scipy import linalg` is not visible inside the methods
from scipy import linalg


class ZCA(object):
    def __init__(self, regularization=1e-5, x=None):
        self.regularization = regularization
        if x is not None:
            self.fit(x)

    def fit(self, x):
        s = x.shape
        x = x.copy().reshape((s[0], np.prod(s[1:])))
        m = np.mean(x, axis=0)
        x -= m
        sigma = np.dot(x.T, x) / x.shape[0]
        U, S, V = linalg.svd(sigma)
        tmp = np.dot(U, np.diag(1. / np.sqrt(S + self.regularization)))
        tmp2 = np.dot(U, np.diag(np.sqrt(S + self.regularization)))
        self.ZCA_mat = th.shared(np.dot(tmp, U.T).astype(th.config.floatX))
        self.inv_ZCA_mat = th.shared(np.dot(tmp2, U.T).astype(th.config.floatX))
        self.mean = th.shared(m.astype(th.config.floatX))

    def apply(self, x):
        s = x.shape
        if isinstance(x, np.ndarray):
            return np.dot(x.reshape((s[0], np.prod(s[1:]))) - self.mean.get_value(),
                          self.ZCA_mat.get_value()).reshape(s)
        elif isinstance(x, T.TensorVariable):
            return T.dot(x.flatten(2) - self.mean.dimshuffle('x', 0), self.ZCA_mat).reshape(s)
        else:
            raise NotImplementedError("Whitening only implemented for numpy arrays or Theano TensorVariables")

    def invert(self, x):
        s = x.shape
        if isinstance(x, np.ndarray):
            return (np.dot(x.reshape((s[0], np.prod(s[1:]))), self.inv_ZCA_mat.get_value())
                    + self.mean.get_value()).reshape(s)
        elif isinstance(x, T.TensorVariable):
            return (T.dot(x.flatten(2), self.inv_ZCA_mat) + self.mean.dimshuffle('x', 0)).reshape(s)
        else:
            raise NotImplementedError("Whitening only implemented for numpy arrays or Theano TensorVariables")
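
# Usage sketch (not part of the original gist; the random data is illustrative).
# fit() estimates the whitening transform from numpy data; apply()/invert()
# work on both numpy arrays and Theano tensor variables.
data_zca = np.random.rand(100, 3, 8, 8).astype(th.config.floatX)
whitener = ZCA(x=data_zca)
white = whitener.apply(data_zca)   # numpy in, numpy out
recon = whitener.invert(white)     # approximately recovers data_zca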
# -*- coding: utf-8 -*-
import theano
import theano.tensor as T
import numpy as np
from theano import OrderedUpdates
#a = T.ivector('a')
#b = theano.shared(np.array([1,2]), 'int32')
#loss = 0
#c = theano.shared(1)
#for i in np.arange(2):
# bn = T.set_subtensor(b[i], b[i]**2)
# loss += b[i] * a[i]
#f = theano.function([a], [b, c,loss], updates=((c,c+1), (b,bn)))
#av = np.array([1,2]).astype('int32')
#print f(av)
#print f(av)
############################################################################
# variant 1: return the update to the shared variable `a` from the scan inner
# function as an updates dictionary, and pass it on to theano.function
a = theano.shared(np.array([[1,2], [3,4]], dtype='float32'))
b = T.fscalar('b')
def loss(i, p):
    an = T.set_subtensor(a[i], a[i]**2)
    p += T.sum(an[i])
    #return (p, OrderedUpdates([(a, an)]))
    return (p, {a: an})
results, updates = theano.scan(fn=loss, sequences=T.arange(2), outputs_info=b)
fun = theano.function([b], [a, results[-1]], updates=updates)
print fun(1)
print fun(1)
############################################################################
# variant 2: pass `a` through outputs_info and wire its final value back into
# the shared variable via the updates argument of theano.function
a = theano.shared(np.array([[1,2], [3,4]], dtype='float32'))
b = T.fscalar('b')
def loss(i, p, a):
    an = T.set_subtensor(a[i], a[i]**2)
    p += T.sum(an[i])
    #return (p, OrderedUpdates([(a, an)]))
    return (p, an)
results, updates = theano.scan(fn=loss, sequences=T.arange(2), outputs_info=[b, a])
p = results[0][-1]
updates = {a: results[1][-1]}
fun = theano.function([b], [a, p], updates=updates)
print fun(1)
print fun(1)
# -*- coding: utf-8 -*-
import theano
import theano.tensor as T
import numpy as np
#x,y=T.matrices('xy')
#
## regular softmax and crossentropy
##sm = T.nnet.softmax(x)
#
##The problem goes away if you explicitly write out the softmax function instead of using Theano's:
#sm = T.exp(x)/(T.exp(x).sum(1,keepdims=True))
#
#cm1=T.nnet.categorical_crossentropy(sm,y)
#g1 = T.grad(cm1.mean(),x)
#
## numerically stable log-softmax with crossentropy
#xdev = x-x.max(1,keepdims=True)
#lsm = xdev - T.log(T.sum(T.exp(xdev),axis=1,keepdims=True))
#sm2 = T.exp(lsm) # just used to show equivalence with sm
#cm2=-T.sum(y*lsm,axis=1)
#g2 = T.grad(cm2.mean(),x)
#
#
## create some inputs into a softmax that are large and labels
#a=np.exp(10*np.random.rand(5,10).astype(theano.config.floatX))
## create some one-hot coded labels
#b=np.eye(5,10).astype(theano.config.floatX)
################################################################################
#There is an optimization that stabilizes the expression,
#but I think it gets triggered only if the target y is expressed as a vector of indices, not as a matrix of one-hot vectors.
x=T.matrix('x')
y=T.ivector('y')
# regular softmax and crossentropy
sm = T.nnet.softmax(x)
cm1=T.nnet.categorical_crossentropy(sm,y)
g1 = T.grad(cm1.mean(),x)
# numerically stable log-softmax with crossentropy
xdev = x-x.max(1,keepdims=True)
lsm = xdev - T.log(T.sum(T.exp(xdev),axis=1,keepdims=True))
sm2 = T.exp(lsm) # just used to show equivalence with sm
cm2=-lsm[T.arange(y.shape[0]), y]
g2 = T.grad(cm2.mean(),x)
# create some large inputs into a softmax
a=np.exp(10*np.random.rand(5,10).astype(theano.config.floatX))
# create some random integer class labels (y is an ivector, so use int32)
b=np.random.randint(0,10,5).astype(np.int32)
##############################################################################
# show equivalence of softmax and exponentiated numerically stable log-softmax
f1=theano.function([x],[sm, sm2])
sm1,sm2=f1(a)
print np.allclose(sm1,sm2)
# now show that the two versions result in the same crossentropy cost
# this indicates that the forward function does provide some numerical stability
f2=theano.function([x,y],[cm1,cm2])
c1,c2 = f2(a,b)
print np.allclose(c1,c2)
# now, show that in the standard softmax case the gradients blow up
# while in the log-softmax case they don't
f3=theano.function([x,y],[g1,g2])
g1_,g2_ = f3(a,b)
print g1_
print g2_