theano: assorted Theano/Lasagne snippets by @jiqiujia (last active February 15, 2017)
import numpy as np
import theano as th
import theano.tensor as T
import lasagne


# T.nnet.relu has some issues with very large inputs, this is more stable
def relu(x):
    return T.maximum(x, 0)


class BatchNormLayer(lasagne.layers.Layer):
    def __init__(self, incoming, b=lasagne.init.Constant(0.), g=lasagne.init.Constant(1.),
                 W=lasagne.init.Normal(0.05), nonlinearity=relu, **kwargs):
        super(BatchNormLayer, self).__init__(incoming, **kwargs)
        self.nonlinearity = nonlinearity
        k = self.input_shape[1]
        if b is not None:
            self.b = self.add_param(b, (k,), name="b", regularizable=False)
        if g is not None:
            self.g = self.add_param(g, (k,), name="g")
        self.avg_batch_mean = self.add_param(lasagne.init.Constant(0.), (k,), name="avg_batch_mean",
                                             regularizable=False, trainable=False)
        self.avg_batch_var = self.add_param(lasagne.init.Constant(1.), (k,), name="avg_batch_var",
                                            regularizable=False, trainable=False)
        incoming.W.set_value(W.sample(incoming.W.get_value().shape))
        if len(self.input_shape) == 4:
            self.axes_to_sum = (0, 2, 3)
            self.dimshuffle_args = ['x', 0, 'x', 'x']
        else:
            self.axes_to_sum = 0
            self.dimshuffle_args = ['x', 0]

    def get_output_for(self, input, deterministic=False, **kwargs):
        if deterministic:
            norm_features = ((input - self.avg_batch_mean.dimshuffle(*self.dimshuffle_args))
                             / T.sqrt(1e-6 + self.avg_batch_var).dimshuffle(*self.dimshuffle_args))
        else:
            batch_mean = T.mean(input, axis=self.axes_to_sum).flatten()
            centered_input = input - batch_mean.dimshuffle(*self.dimshuffle_args)
            batch_var = T.mean(T.square(centered_input), axis=self.axes_to_sum).flatten()
            batch_stdv = T.sqrt(1e-6 + batch_var)
            norm_features = centered_input / batch_stdv.dimshuffle(*self.dimshuffle_args)
            # BN updates: exponential moving averages of the batch statistics;
            # they must be collected by the caller and passed to theano.function
            new_m = 0.9*self.avg_batch_mean + 0.1*batch_mean
            new_v = (0.9*self.avg_batch_var
                     + T.cast((0.1*input.shape[0])/(input.shape[0]-1.), th.config.floatX)*batch_var)
            self.bn_updates = [(self.avg_batch_mean, new_m), (self.avg_batch_var, new_v)]
        if hasattr(self, 'g'):
            activation = norm_features*self.g.dimshuffle(*self.dimshuffle_args)
        else:
            activation = norm_features
        if hasattr(self, 'b'):
            activation += self.b.dimshuffle(*self.dimshuffle_args)
        return self.nonlinearity(activation)
def batch_norm(layer, b=lasagne.init.Constant(0.), g=lasagne.init.Constant(1.), **kwargs):
    """
    adapted from https://gist.github.com/f0k/f1a6bd3c8585c400c190
    Convenience function to apply batch normalization to a given layer's output.
    Will steal the layer's nonlinearity if there is one (effectively introducing
    the normalization right before the nonlinearity), and will remove the
    layer's bias if there is one (because it would be redundant).
    """
    nonlinearity = getattr(layer, 'nonlinearity', None)
    if nonlinearity is not None:
        layer.nonlinearity = lasagne.nonlinearities.identity
    if hasattr(layer, 'b'):
        del layer.params[layer.b]
        layer.b = None
    return BatchNormLayer(layer, b, g, nonlinearity=nonlinearity, **kwargs)
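
# Usage sketch (not part of the original gist; the layer sizes and variable
# names below are made up for illustration). batch_norm() wraps a Lasagne
# layer, stealing its nonlinearity and bias; after a non-deterministic forward
# pass the BatchNormLayer exposes bn_updates, which must be passed to
# theano.function explicitly so the running mean/variance get updated.
x_sym = T.matrix('x_sym')
l_in_bn = lasagne.layers.InputLayer((None, 100), input_var=x_sym)
l_bn = batch_norm(lasagne.layers.DenseLayer(l_in_bn, num_units=50, nonlinearity=relu))
train_out = lasagne.layers.get_output(l_bn, deterministic=False)
train_fn_bn = th.function([x_sym], train_out, updates=getattr(l_bn, 'bn_updates', []))
test_out = lasagne.layers.get_output(l_bn, deterministic=True)  # uses the running averages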
class WeightNormLayer(lasagne.layers.Layer):
    def __init__(self, incoming, b=lasagne.init.Constant(0.), g=lasagne.init.Constant(1.),
                 W=lasagne.init.Normal(0.05), nonlinearity=relu, **kwargs):
        super(WeightNormLayer, self).__init__(incoming, **kwargs)
        self.nonlinearity = nonlinearity
        k = self.input_shape[1]
        if b is not None:
            self.b = self.add_param(b, (k,), name="b", regularizable=False)
        if g is not None:
            self.g = self.add_param(g, (k,), name="g")
        if len(self.input_shape) == 4:
            self.axes_to_sum = (0, 2, 3)
            self.dimshuffle_args = ['x', 0, 'x', 'x']
        else:
            self.axes_to_sum = 0
            self.dimshuffle_args = ['x', 0]
        # scale weights in layer below
        incoming.W_param = incoming.W
        incoming.W_param.set_value(W.sample(incoming.W_param.get_value().shape))
        if incoming.W_param.ndim == 4:
            W_axes_to_sum = (1, 2, 3)
            W_dimshuffle_args = [0, 'x', 'x', 'x']
        else:
            W_axes_to_sum = 0
            W_dimshuffle_args = ['x', 0]
        if g is not None:
            incoming.W = incoming.W_param * (self.g / T.sqrt(T.sum(T.square(incoming.W_param),
                                             axis=W_axes_to_sum))).dimshuffle(*W_dimshuffle_args)
        else:
            incoming.W = incoming.W_param / T.sqrt(T.sum(T.square(incoming.W_param),
                                                         axis=W_axes_to_sum, keepdims=True))

    def get_output_for(self, input, init=False, **kwargs):
        if init:
            m = T.mean(input, self.axes_to_sum)
            input -= m.dimshuffle(*self.dimshuffle_args)
            stdv = T.sqrt(T.mean(T.square(input), axis=self.axes_to_sum))
            input /= stdv.dimshuffle(*self.dimshuffle_args)
            # data-dependent initialization of `b` and `g`
            self.init_updates = [(self.b, -m/stdv), (self.g, self.g/stdv)]
        elif hasattr(self, 'b'):
            input += self.b.dimshuffle(*self.dimshuffle_args)
        return self.nonlinearity(input)


def weight_norm(layer, **kwargs):
    nonlinearity = getattr(layer, 'nonlinearity', None)
    if nonlinearity is not None:
        layer.nonlinearity = lasagne.nonlinearities.identity
    if hasattr(layer, 'b'):
        del layer.params[layer.b]
        layer.b = None
    return WeightNormLayer(layer, nonlinearity=nonlinearity, **kwargs)
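
# Usage sketch (not part of the original gist; sizes and names are illustrative).
# weight_norm() wraps a layer whose W it reparameterizes; the data-dependent
# initialization of b and g is done by requesting the output with init=True and
# applying the resulting init_updates once on a batch of real data.
x_wn = T.matrix('x_wn')
l_in_wn = lasagne.layers.InputLayer((None, 100), input_var=x_wn)
l_wn = weight_norm(lasagne.layers.DenseLayer(l_in_wn, num_units=50, nonlinearity=relu))
init_out = lasagne.layers.get_output(l_wn, init=True)
init_fn = th.function([x_wn], init_out, updates=getattr(l_wn, 'init_updates', []))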
# from https://github.com/jostosh/theano_utils/blob/master/lcn.py
import numpy
import theano
from theano.tensor.nnet import conv


class LecunLCN(object):
    def __init__(self, X, image_shape, threshold=1e-4, radius=9, use_divisor=True):
        """
        Allocate an LCN.

        :type X: theano.tensor.dtensor4
        :param X: symbolic image tensor, of shape image_shape

        :type image_shape: tuple or list of length 4
        :param image_shape: (batch size, num input feature maps,
                             image height, image width)

        :type threshold: double
        :param threshold: the threshold will be used to avoid division by zero

        :type radius: int
        :param radius: determines size of Gaussian filter patch (default 9x9)

        :type use_divisor: Boolean
        :param use_divisor: whether or not to apply divisive normalization
        """
        # Get Gaussian filter
        filter_shape = (1, image_shape[1], radius, radius)
        self.filters = theano.shared(self.gaussian_filter(filter_shape), borrow=True)

        # Compute the Gaussian weighted average by means of convolution
        convout = conv.conv2d(
            input=X,
            filters=self.filters,
            image_shape=image_shape,
            filter_shape=filter_shape,
            border_mode='full'
        )

        # Subtractive step
        mid = int(numpy.floor(filter_shape[2] / 2.))

        # Make filter dimension broadcastable and subtract
        centered_X = X - T.addbroadcast(convout[:, :, mid:-mid, mid:-mid], 1)

        # Boolean marks whether or not to perform divisive step
        if use_divisor:
            # Note that the local variances can be computed by using the centered_X
            # tensor. If we convolve this with the mean filter, that should give us
            # the variance at each point. We simply take the square root to get our
            # denominator

            # Compute variances
            sum_sqr_XX = conv.conv2d(
                input=T.sqr(centered_X),
                filters=self.filters,
                image_shape=image_shape,
                filter_shape=filter_shape,
                border_mode='full'
            )

            # Take square root to get local standard deviation
            denom = T.sqrt(sum_sqr_XX[:, :, mid:-mid, mid:-mid])
            per_img_mean = denom.mean(axis=[2, 3])
            divisor = T.largest(per_img_mean.dimshuffle(0, 1, 'x', 'x'), denom)
            # Divisive step
            new_X = centered_X / T.maximum(T.addbroadcast(divisor, 1), threshold)
        else:
            new_X = centered_X

        self.output = new_X

    def gaussian_filter(self, kernel_shape):
        x = numpy.zeros(kernel_shape, dtype=theano.config.floatX)

        def gauss(x, y, sigma=2.0):
            Z = 2 * numpy.pi * sigma ** 2
            return 1. / Z * numpy.exp(-(x ** 2 + y ** 2) / (2. * sigma ** 2))

        mid = numpy.floor(kernel_shape[-1] / 2.)
        for kernel_idx in xrange(0, kernel_shape[1]):
            for i in xrange(0, kernel_shape[2]):
                for j in xrange(0, kernel_shape[3]):
                    x[0, kernel_idx, i, j] = gauss(i - mid, j - mid)

        return x / numpy.sum(x)
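
# Usage sketch (not part of the original gist; the 32x32 RGB shape and names
# are illustrative). LecunLCN builds the normalized graph at construction time
# and exposes it as .output. Note that image_shape fixes the batch size, so
# lcn_fn must be called with batches of exactly that size.
X_lcn = T.tensor4('X_lcn')
lcn = LecunLCN(X_lcn, image_shape=(128, 3, 32, 32), radius=9)
lcn_fn = theano.function([X_lcn], lcn.output)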
def get_klloss(n, margin, prediction, target_var):
    '''
    Pairwise KL-divergence loss from the paper
    'Neural network-based clustering using pairwise constraints'.

    params:
        n: batch size
        margin: margin for the KL divergence on pairs with different labels
        prediction: softmax output, shape (n, num_classes)
        target_var: ground-truth label vector, shape (n,)
    '''
    log_prediction = T.log(prediction)
    prediction = prediction.reshape((n, 1, -1))
    logpred1 = log_prediction.reshape((n, 1, -1))
    logpred2 = log_prediction.reshape((1, n, -1))
    logpred_m = logpred1 - logpred2
    # kl[i, j] = KL(prediction[i] || prediction[j])
    kl = prediction * logpred_m
    kl = T.sum(kl, 2)
    id1 = target_var.reshape((-1, 1))
    id2 = target_var.reshape((1, -1))
    sim = (id1 - id2)
    # for pairs with different labels, apply a hinge: max(margin - KL, 0)
    idxes = T.neq(sim, 0).nonzero()
    kl = T.set_subtensor(kl[idxes], T.maximum(margin - kl[idxes], 0))
    return T.mean(kl)
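
# Usage sketch (not part of the original gist; the batch size of 128, the
# margin of 2.0 and the variable names are illustrative). n is baked into the
# reshapes, so it must match the runtime batch size.
pred_kl = T.matrix('pred_kl')
labels_kl = T.ivector('labels_kl')
kl_loss = get_klloss(128, 2.0, pred_kl, labels_kl)
kl_fn = theano.function([pred_kl, labels_kl], kl_loss)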
def wst_loss(ypred, gt, M, lbda, it_num):
    '''
    Entropy-regularized Wasserstein (Sinkhorn) loss between the predicted
    distribution ypred and the one-hot encoding of the ground-truth labels gt,
    with ground metric M, regularization strength lbda and it_num Sinkhorn
    iterations.
    '''
    n = gt.shape[0]
    k = M.shape[0]
    ylabel = T.zeros((n, k))
    ylabel = T.set_subtensor(ylabel[T.arange(n), gt], 1)
    u = T.ones((n, k), dtype='float32')
    v = T.ones((n, k), dtype='float32')
    K = T.exp(-lbda * M - 1)
    KM = K * M

    # one Sinkhorn iteration: alternately rescale u and v
    def fn(u, v, ypred, ylabel, K):
        v = ylabel / T.dot(u, K)
        u = ypred / T.dot(v, K)
        return u, v
    #def fn(u, ypred, ylabel, K):
    #    u = ypred / T.dot((ylabel / T.dot(u, K)), K)
    #    return u

    results, _ = theano.scan(fn, outputs_info=[u, v],
                             non_sequences=[ypred, ylabel, K], n_steps=it_num)
    u = results[0][-1]
    v = results[1][-1]
    #u = results[-1]
    #v = ylabel / T.dot(u, K)
    loss = T.sum(v * T.dot(u, KM))
    # entropy term
    epsilon = 1e-7
    u = T.clip(u, epsilon, 1 - epsilon)
    v = T.clip(v, epsilon, 1 - epsilon)
    loss += 1/lbda * T.sum(T.dot(u * T.log(u), K) * v)
    loss += 1/lbda * T.sum(T.dot(v * T.log(v), K) * u)
    loss += 1/lbda * T.sum(T.dot(u, K * T.log(K)) * v)
    return loss / n
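
# Usage sketch (not part of the original gist; the variable names and the
# chosen lbda/iteration count are illustrative). M is a k x k ground-metric
# matrix between classes.
ypred_w = T.fmatrix('ypred_w')
gt_w = T.ivector('gt_w')
M_w = T.fmatrix('M_w')
w_loss = wst_loss(ypred_w, gt_w, M_w, lbda=10.0, it_num=20)
w_fn = theano.function([ypred_w, gt_w, M_w], w_loss)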
from multiprocessing import Pool
import sys
sys.path.append('/media/dl/data2/cifar10_resnet/')
from utils import *


def gg(aa, bb):
    # Theano must be imported inside the worker process,
    # otherwise a MemoryError can occur
    import theano
    import theano.tensor as T
    a = T.scalar(name='a', dtype='float32')
    b = T.scalar(name='b', dtype='float32')
    c = a + b
    f = theano.function([a, b], c)
    return f(aa, bb)


if __name__ == '__main__':
    pool = Pool(processes=2)
    pool.apply_async(gg, (1.0, 2.0))
    #r = pool.apply_async(train_cls, (data, cls_target_var, cls_train_fn, cls_val_fn))
    #kl_label = pool.apply_async(model_kl, (data,), kwargs)
    pool.close()
    pool.join()
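
# Follow-up sketch (not part of the original gist): apply_async returns an
# AsyncResult; calling .get() on it retrieves the worker's return value and
# re-raises any exception raised inside the worker, which makes debugging easier.
if __name__ == '__main__':
    pool2 = Pool(processes=1)
    res = pool2.apply_async(gg, (1.0, 2.0))
    pool2.close()
    pool2.join()
    print res.get()  # prints 3.0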
def adamax_updates(params, cost, lr=0.001, mom1=0.9, mom2=0.999):
    # Adamax: Adam variant based on the infinity norm of the gradients
    updates = []
    grads = T.grad(cost, params)
    for p, g in zip(params, grads):
        mg = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
        v = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
        if mom1 > 0:
            v_t = mom1*v + (1. - mom1)*g
            updates.append((v, v_t))
        else:
            v_t = g
        mg_t = T.maximum(mom2*mg, abs(g))
        g_t = v_t / (mg_t + 1e-6)
        p_t = p - lr * g_t
        updates.append((mg, mg_t))
        updates.append((p, p_t))
    return updates


def adam_updates(params, cost, lr=0.001, mom1=0.9, mom2=0.999):
    # Adam with bias-corrected first and second moment estimates
    updates = []
    grads = T.grad(cost, params)
    t = th.shared(np.cast[th.config.floatX](1.))
    for p, g in zip(params, grads):
        v = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
        mg = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
        v_t = mom1*v + (1. - mom1)*g
        mg_t = mom2*mg + (1. - mom2)*T.square(g)
        v_hat = v_t / (1. - mom1 ** t)
        mg_hat = mg_t / (1. - mom2 ** t)
        g_t = v_hat / T.sqrt(mg_hat + 1e-8)
        p_t = p - lr * g_t
        updates.append((v, v_t))
        updates.append((mg, mg_t))
        updates.append((p, p_t))
    # the shared timestep is updated once per call, outside the parameter loop
    updates.append((t, t+1))
    return updates
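
# Usage sketch (not part of the original gist; the tiny least-squares setup is
# illustrative). adam_updates() returns a list of (shared_variable, update)
# pairs that can be passed straight to theano.function.
w_shared = th.shared(np.zeros(5, dtype=th.config.floatX))
x_adam = T.vector('x_adam')
y_adam = T.scalar('y_adam')
cost_adam = T.square(T.dot(x_adam, w_shared) - y_adam)
train_step = th.function([x_adam, y_adam], cost_adam,
                         updates=adam_updates([w_shared], cost_adam, lr=0.01))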
# pairwise Minkowski distance between the rows of X and the rows of Y
import theano
import theano.tensor as T

X = T.fmatrix('X')
Y = T.fmatrix('Y')
P = T.scalar('P')
translation_vectors = X.reshape((X.shape[0], 1, -1)) - Y.reshape((1, Y.shape[0], -1))
minkowski_distances = (abs(translation_vectors) ** P).sum(2) ** (1. / P)
f_minkowski = theano.function([X, Y, P], minkowski_distances)
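
# Usage sketch (not part of the original gist; random data purely for
# illustration). With P=2 this reduces to pairwise Euclidean distances.
xs = np.random.rand(4, 3).astype('float32')
ys = np.random.rand(5, 3).astype('float32')
dists = f_minkowski(xs, ys, 2.0)  # shape (4, 5)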
# scipy.linalg must be imported at module level: a class-level
# `from scipy import linalg` is not visible inside the methods
from scipy import linalg


class ZCA(object):
    def __init__(self, regularization=1e-5, x=None):
        self.regularization = regularization
        if x is not None:
            self.fit(x)

    def fit(self, x):
        s = x.shape
        x = x.copy().reshape((s[0], np.prod(s[1:])))
        m = np.mean(x, axis=0)
        x -= m
        sigma = np.dot(x.T, x) / x.shape[0]
        U, S, V = linalg.svd(sigma)
        tmp = np.dot(U, np.diag(1. / np.sqrt(S + self.regularization)))
        tmp2 = np.dot(U, np.diag(np.sqrt(S + self.regularization)))
        self.ZCA_mat = th.shared(np.dot(tmp, U.T).astype(th.config.floatX))
        self.inv_ZCA_mat = th.shared(np.dot(tmp2, U.T).astype(th.config.floatX))
        self.mean = th.shared(m.astype(th.config.floatX))

    def apply(self, x):
        s = x.shape
        if isinstance(x, np.ndarray):
            return np.dot(x.reshape((s[0], np.prod(s[1:]))) - self.mean.get_value(),
                          self.ZCA_mat.get_value()).reshape(s)
        elif isinstance(x, T.TensorVariable):
            return T.dot(x.flatten(2) - self.mean.dimshuffle('x', 0), self.ZCA_mat).reshape(s)
        else:
            raise NotImplementedError("Whitening only implemented for numpy arrays or Theano TensorVariables")

    def invert(self, x):
        s = x.shape
        if isinstance(x, np.ndarray):
            return (np.dot(x.reshape((s[0], np.prod(s[1:]))), self.inv_ZCA_mat.get_value())
                    + self.mean.get_value()).reshape(s)
        elif isinstance(x, T.TensorVariable):
            return (T.dot(x.flatten(2), self.inv_ZCA_mat) + self.mean.dimshuffle('x', 0)).reshape(s)
        else:
            raise NotImplementedError("Whitening only implemented for numpy arrays or Theano TensorVariables")
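
# Usage sketch (not part of the original gist; the random data is illustrative).
# fit() estimates the whitening transform from numpy data; apply()/invert()
# work on both numpy arrays and Theano tensor variables.
data_zca = np.random.rand(100, 3, 8, 8).astype(th.config.floatX)
whitener = ZCA(x=data_zca)
white = whitener.apply(data_zca)   # numpy in, numpy out
recon = whitener.invert(white)     # approximately recovers data_zca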
# -*- coding: utf-8 -*-
import theano
import theano.tensor as T
import numpy as np
from theano import OrderedUpdates
#a = T.ivector('a')
#b = theano.shared(np.array([1,2]), 'int32')
#loss = 0
#c = theano.shared(1)
#for i in np.arange(2):
# bn = T.set_subtensor(b[i], b[i]**2)
# loss += b[i] * a[i]
#f = theano.function([a], [b, c,loss], updates=((c,c+1), (b,bn)))
#av = np.array([1,2]).astype('int32')
#print f(av)
#print f(av)
############################################################################
# variant 1: return the update to the shared variable `a` from the scan inner
# function as an updates dictionary, and pass it on to theano.function
a = theano.shared(np.array([[1,2], [3,4]], dtype='float32'))
b = T.fscalar('b')
def loss(i, p):
    an = T.set_subtensor(a[i], a[i]**2)
    p += T.sum(an[i])
    #return (p, OrderedUpdates([(a, an)]))
    return (p, {a: an})
results, updates = theano.scan(fn=loss, sequences=T.arange(2), outputs_info=b)
fun = theano.function([b], [a, results[-1]], updates=updates)
print fun(1)
print fun(1)
############################################################################
# variant 2: pass `a` through outputs_info and wire its final value back into
# the shared variable via the updates argument of theano.function
a = theano.shared(np.array([[1,2], [3,4]], dtype='float32'))
b = T.fscalar('b')
def loss(i, p, a):
    an = T.set_subtensor(a[i], a[i]**2)
    p += T.sum(an[i])
    #return (p, OrderedUpdates([(a, an)]))
    return (p, an)
results, updates = theano.scan(fn=loss, sequences=T.arange(2), outputs_info=[b, a])
p = results[0][-1]
updates = {a: results[1][-1]}
fun = theano.function([b], [a, p], updates=updates)
print fun(1)
print fun(1)
# -*- coding: utf-8 -*-
import theano
import theano.tensor as T
import numpy as np
#x,y=T.matrices('xy')
#
## regular softmax and crossentropy
##sm = T.nnet.softmax(x)
#
##The problem goes away if you explicitly write out the softmax function instead of using Theano's:
#sm = T.exp(x)/(T.exp(x).sum(1,keepdims=True))
#
#cm1=T.nnet.categorical_crossentropy(sm,y)
#g1 = T.grad(cm1.mean(),x)
#
## numerically stable log-softmax with crossentropy
#xdev = x-x.max(1,keepdims=True)
#lsm = xdev - T.log(T.sum(T.exp(xdev),axis=1,keepdims=True))
#sm2 = T.exp(lsm) # just used to show equivalence with sm
#cm2=-T.sum(y*lsm,axis=1)
#g2 = T.grad(cm2.mean(),x)
#
#
## create some inputs into a softmax that are large and labels
#a=np.exp(10*np.random.rand(5,10).astype(theano.config.floatX))
## create some one-hot coded labels
#b=np.eye(5,10).astype(theano.config.floatX)
################################################################################
#There is an optimization that stabilizes the expression,
#but I think it gets triggered only if the target y is expressed as a vector of indices, not as a matrix of one-hot vectors.
x=T.matrix('x')
y=T.ivector('y')
# regular softmax and crossentropy
sm = T.nnet.softmax(x)
cm1=T.nnet.categorical_crossentropy(sm,y)
g1 = T.grad(cm1.mean(),x)
# numerically stable log-softmax with crossentropy
xdev = x-x.max(1,keepdims=True)
lsm = xdev - T.log(T.sum(T.exp(xdev),axis=1,keepdims=True))
sm2 = T.exp(lsm) # just used to show equivalence with sm
cm2=-lsm[T.arange(y.shape[0]), y]
g2 = T.grad(cm2.mean(),x)
# create some large inputs into a softmax
a=np.exp(10*np.random.rand(5,10).astype(theano.config.floatX))
# create some random integer class labels (y is an ivector, so use int32)
b=np.random.randint(0,10,5).astype(np.int32)
##############################################################################
# show equivalence of softmax and exponentiated numerically stable log-softmax
f1=theano.function([x],[sm, sm2])
sm1,sm2=f1(a)
print np.allclose(sm1,sm2)
# now show that the two versions result in the same crossentropy cost
# this indicates that the forward function does provide some numerical stability
f2=theano.function([x,y],[cm1,cm2])
c1,c2 = f2(a,b)
print np.allclose(c1,c2)
# now, show that in the standard softmax case the gradients blow up
# while in the log-softmax case they don't
f3=theano.function([x,y],[g1,g2])
g1_,g2_ = f3(a,b)
print g1_
print g2_