tkf · March 22, 2019 20:22
diff --git a/.gitignore b/.gitignore
 /.cache
 *.pyc
diff --git a/layer_normalization.py b/layer_normalization.py
 import lasagne


 class LayerNormLayer(lasagne.layers.BatchNormLayer):

    """
    Layer Normalization (Ba, Kiros & Hinton, 2016) built on `BatchNormLayer`.

    Every sample is standardized to zero mean and unit variance across
    its neurons, rather than across the batch as batch normalization
    does.  The parent is constructed with ``beta=None`` and
    ``gamma=None``, so this layer does not have learnable parameters;
    it is meant to be sandwiched between a `DenseLayer` and a
    `BiasLayer` etc.  See `layer_normalized_dense_layer`.

    With the default ``axes='auto'`` the first (0th) axis is assumed
    to be the batch dimension and all remaining axes are used to
    calculate the mean and variance.  In particular, recurrent layers
    are not supported.

    - Ba, Kiros & Hinton (2016) "Layer Normalization."
      http://arxiv.org/abs/1607.06450
    - https://github.com/Lasagne/Lasagne/issues/736#issuecomment-241374360

    """

    def __init__(self, incoming, axes='auto', **kwargs):
        # An explicit `axes` is handed to the parent constructor as-is;
        # for 'auto' nothing is forwarded and the attribute is replaced
        # once the parent has been initialized.
        auto_axes = axes == 'auto'
        if not auto_axes:
            kwargs['axes'] = axes

        parent = super(LayerNormLayer, self)
        parent.__init__(incoming, beta=None, gamma=None, **kwargs)

        if auto_axes:
            # Every axis except the leading one.
            self.axes = tuple(range(1, len(self.input_shape)))

    def get_output_for(self, input,
                       batch_norm_use_averages=False,
                       batch_norm_update_averages=False,
                       **kwargs):
        # Both flags default to False in this override and are always
        # forwarded explicitly to the parent implementation.
        kwargs.update(
            batch_norm_use_averages=batch_norm_use_averages,
            batch_norm_update_averages=batch_norm_update_averages)
        return super(LayerNormLayer, self).get_output_for(input, **kwargs)


 def layer_normalized_dense_layer(
        incoming, num_units,
        nonlinearity=lasagne.nonlinearities.rectify,
        W=lasagne.init.Normal(std=1),
        b=lasagne.init.Constant(0.),
        **kwargs):
    """
    Build a dense layer whose pre-activations are layer-normalized.

    The returned stack is, in order: a `DenseLayer` with a linear
    nonlinearity and no bias, a `LayerNormLayer`, a `ScaleLayer`, a
    `BiasLayer` and finally a `NonlinearityLayer`.

    - `incoming`: passed to `DenseLayer` as its input.
    - `num_units`: number of units of the `DenseLayer`; must be
      greater than 1.
    - `nonlinearity`: applied last, through the `NonlinearityLayer`.
    - `W`: weight specification for the `DenseLayer`.
    - `b`: bias specification for the `BiasLayer` (the `DenseLayer`
      itself is created with ``b=None``).
    - `kwargs`: forwarded to the `DenseLayer` only.

    Returns the final `NonlinearityLayer`.  Raises `ValueError` when
    `num_units` is not greater than 1.
    """
    # Explicit check instead of `assert`, which disappears under
    # `python -O`.
    if not (num_units > 1):
        raise ValueError(
            'num_units must be greater than 1, got %r' % (num_units,))

    layer = lasagne.layers.DenseLayer(
        incoming, num_units, W=W, b=None,
        nonlinearity=lasagne.nonlinearities.linear,
        **kwargs)
    layer = LayerNormLayer(layer)
    layer = lasagne.layers.ScaleLayer(layer)
    layer = lasagne.layers.BiasLayer(layer, b=b)
    return lasagne.layers.NonlinearityLayer(layer, nonlinearity=nonlinearity)
diff --git a/LICENSE b/LICENSE
 Copyright 2017, Takafumi Arakaki

 Redistribution and  use in  source and binary  forms, with  or without
 modification, are permitted provided that the following conditions are
 met:

 1.  Redistributions of  source code  must retain  the above  copyright
 notice, this list of conditions and the following disclaimer.

 2.  Redistributions in binary form  must reproduce the above copyright
 notice, this  list of conditions  and the following disclaimer  in the
 documentation and/or other materials provided with the distribution.

 THIS SOFTWARE  IS PROVIDED BY  THE COPYRIGHT HOLDERS  AND CONTRIBUTORS
 "AS  IS" AND  ANY EXPRESS  OR IMPLIED  WARRANTIES, INCLUDING,  BUT NOT
 LIMITED TO, THE IMPLIED WARRANTIES  OF MERCHANTABILITY AND FITNESS FOR
 A PARTICULAR PURPOSE ARE DISCLAIMED.   IN NO EVENT SHALL THE COPYRIGHT
 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES  (INCLUDING,  BUT  NOT
 LIMITED TO, PROCUREMENT OF SUBSTITUTE  GOODS OR SERVICES; LOSS OF USE,
 DATA, OR PROFITS; OR BUSINESS  INTERRUPTION) HOWEVER CAUSED AND ON ANY
 THEORY OF  LIABILITY, WHETHER IN  CONTRACT, STRICT LIABILITY,  OR TORT
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN  ANY WAY OUT OF THE USE
 OF THIS SOFTWARE,  EVEN IF ADVISED OF THE POSSIBILITY  OF SUCH DAMAGE.
diff --git a/recurrent_layer_normalization.py b/recurrent_layer_normalization.py
 import lasagne
 import theano


 class LayerNormalization(object):

    """
    Layer normalization over the last axis, followed by a nonlinearity.

    Holds two Theano shared variables of length `num_units`: the bias
    `b` and the gain `g`, whose initial values are drawn from the
    given initializers.  `normalizing_nonlinearity` can be used
    wherever a plain nonlinearity callable is expected, and
    `register_to` attaches `b` and `g` to a layer through its
    `add_param` method.
    """

    def __init__(self, num_units,
                 nonlinearity=lasagne.nonlinearities.rectify,
                 b=lasagne.init.Constant(0.),
                 g=lasagne.init.Constant(1.),
                 eps=1e-5):
        self.num_units = num_units
        self.nonlinearity = nonlinearity
        self.eps = eps

        # Draw the bias before the gain.
        bias_value = b.sample(num_units)
        gain_value = g.sample(num_units)
        self.b = theano.shared(bias_value, name='layer_norm.b')
        self.g = theano.shared(gain_value, name='layer_norm.g')

    def normalizing_nonlinearity(self, x):
        """Standardize `x` along its last axis, scale, shift and activate."""
        # Shape (1, ..., 1, -1): lets the 1-d parameters broadcast
        # against every leading axis of `x`.
        param_shape = (1,) * (x.ndim - 1) + (-1,)
        gain = self.g.reshape(param_shape)
        bias = self.b.reshape(param_shape)

        centered = x - x.mean(-1, keepdims=True)
        sigma = theano.tensor.sqrt(x.var(-1, keepdims=True) + self.eps)
        return self.nonlinearity(gain * centered / sigma + bias)

    def register_to(self, layer):
        """Add `b` and then `g` to `layer` via `layer.add_param`."""
        shape = (self.num_units,)
        for param in (self.b, self.g):
            layer.add_param(param, shape)


 class RecurrentNormalizingLayer(lasagne.layers.RecurrentLayer):

    """
    `RecurrentLayer` whose nonlinearity is wrapped in layer normalization.

    The parent layer is created with ``b=None``; the `b` and `g`
    initializers are given to a `LayerNormalization` helper instead,
    whose `normalizing_nonlinearity` is installed as the parent's
    nonlinearity.  The helper's parameters are registered on
    `self.hidden_to_hidden`.  Note that the default `eps` here (0.05)
    differs from the default of `LayerNormalization` (1e-5).
    """

    def __init__(self, incoming, num_units,
                 W_hid_to_hid=lasagne.init.Uniform(1e-4),
                 b=lasagne.init.Uniform(0.1),
                 g=lasagne.init.Constant(1.),
                 hid_init=lasagne.init.Uniform(0.1),
                 nonlinearity=lasagne.nonlinearities.rectify,
                 eps=0.05,
                 **kwargs):
        normalizer = LayerNormalization(
            num_units, nonlinearity=nonlinearity, b=b, g=g, eps=eps)
        self.layer_normalization = normalizer

        super(RecurrentNormalizingLayer, self).__init__(
            incoming, num_units,
            W_hid_to_hid=W_hid_to_hid,
            hid_init=hid_init,
            # The bias is owned by the normalizer, not by the parent.
            b=None,
            nonlinearity=normalizer.normalizing_nonlinearity,
            **kwargs)

        # `hidden_to_hidden` only exists after the parent constructor ran.
        normalizer.register_to(self.hidden_to_hidden)
diff --git a/test_layer_normalization.py b/test_layer_normalization.py
 import lasagne
 import numpy as np
 import pytest

 from layer_normalization import LayerNormLayer, layer_normalized_dense_layer


 def np_norm_layer(x, epsilon=0):
    """
    NumPy reference implementation of layer normalization.

    Standardizes `x` over every axis except the first one: the mean is
    subtracted and the result is divided by ``sqrt(var + epsilon)``.
    The output has the same shape as `x`.
    """
    feature_axes = tuple(range(1, x.ndim))
    centered = x - x.mean(axis=feature_axes, keepdims=True)
    variance = x.var(axis=feature_axes, keepdims=True)
    return centered / np.sqrt(variance + epsilon)


 @pytest.mark.parametrize('def_shape, real_shape', [
    ((2, 3), None),
    ((None, 3), (2, 3)),
    ((2, 3, 4), None),
    ((None, 3, 4), (2, 3, 4)),
 ])
 def test_layer_norm_layer(def_shape, real_shape):
    """
    Compare `LayerNormLayer` output with the NumPy reference.

    `def_shape` is the shape declared on the input layer (it may
    contain `None`); `real_shape`, when given, is the shape of the
    data actually fed in.
    """
    input_layer = lasagne.layers.InputLayer(def_shape)
    norm_layer = LayerNormLayer(input_layer)
    output_expr = lasagne.layers.get_output(norm_layer)

    data_shape = real_shape or def_shape
    x = np.random.RandomState(0).randn(*data_shape)
    actual = output_expr.eval({input_layer.input_var: x})

    # Use the layer's own epsilon so both sides add the same constant.
    desired = np_norm_layer(x, norm_layer.epsilon)
    assert desired.shape == x.shape

    np.testing.assert_almost_equal(actual, desired)


 @pytest.mark.parametrize('batchsize, in_dim, out_dim', [
    (2, 3, 4),
    (10, 30, 20),
 ])
 def test_layer_normalized_dense_layer(batchsize, in_dim, out_dim):
    """
    Compare `layer_normalized_dense_layer` with a NumPy computation.

    The epsilon of the embedded `LayerNormLayer` is set to 0 so the
    reference can be computed with `np_norm_layer`'s default.
    """
    rng = np.random.RandomState(0)
    x = rng.randn(batchsize, in_dim)
    W = rng.randn(in_dim, out_dim)
    b = rng.randn(out_dim)

    input_layer = lasagne.layers.InputLayer(x.shape)
    net = layer_normalized_dense_layer(input_layer, out_dim, W=W, b=b)

    norm_layers = [layer for layer in lasagne.layers.get_all_layers(net)
                   if isinstance(layer, LayerNormLayer)]
    if not norm_layers:
        raise ValueError('LayerNormLayer not found')
    # Only the first match is touched.
    norm_layer = norm_layers[0]
    assert hasattr(norm_layer, 'epsilon')
    norm_layer.epsilon = 0

    output_expr = lasagne.layers.get_output(net)
    actual = output_expr.eval({input_layer.input_var: x})

    pre_activation = np.tensordot(x, W, axes=1)
    desired = net.nonlinearity(np_norm_layer(pre_activation) + b)

    # Guard against a trivially all-zero expectation.
    assert (desired > 0).any()
    np.testing.assert_almost_equal(actual, desired)
	import lasagne


	class LayerNormLayer(lasagne.layers.BatchNormLayer):

	    """
	    Layer Normalization (Ba, Kiros & Hinton, 2016) built on `BatchNormLayer`.

	    Every sample is standardized to zero mean and unit variance across
	    its neurons, rather than across the batch as batch normalization
	    does.  The parent is constructed with ``beta=None`` and
	    ``gamma=None``, so this layer does not have learnable parameters;
	    it is meant to be sandwiched between a `DenseLayer` and a
	    `BiasLayer` etc.  See `layer_normalized_dense_layer`.

	    With the default ``axes='auto'`` the first (0th) axis is assumed
	    to be the batch dimension and all remaining axes are used to
	    calculate the mean and variance.  In particular, recurrent layers
	    are not supported.

	    - Ba, Kiros & Hinton (2016) "Layer Normalization."
	      http://arxiv.org/abs/1607.06450
	    - https://github.com/Lasagne/Lasagne/issues/736#issuecomment-241374360

	    """

	    def __init__(self, incoming, axes='auto', **kwargs):
	        # An explicit `axes` is handed to the parent constructor as-is;
	        # for 'auto' nothing is forwarded and the attribute is replaced
	        # once the parent has been initialized.
	        auto_axes = axes == 'auto'
	        if not auto_axes:
	            kwargs['axes'] = axes

	        super(LayerNormLayer, self).__init__(
	            incoming,
	            beta=None, gamma=None,
	            **kwargs)

	        if auto_axes:
	            # Every axis except the leading one.
	            self.axes = tuple(range(1, len(self.input_shape)))

	    def get_output_for(self, input,
	                       batch_norm_use_averages=False,
	                       batch_norm_update_averages=False,
	                       **kwargs):
	        # Both flags default to False in this override and are always
	        # forwarded explicitly to the parent implementation.
	        return super(LayerNormLayer, self).get_output_for(
	            input,
	            batch_norm_use_averages=batch_norm_use_averages,
	            batch_norm_update_averages=batch_norm_update_averages,
	            **kwargs)


	def layer_normalized_dense_layer(
	        incoming, num_units,
	        nonlinearity=lasagne.nonlinearities.rectify,
	        W=lasagne.init.Normal(std=1),
	        b=lasagne.init.Constant(0.),
	        **kwargs):
	    """
	    Build a dense layer whose pre-activations are layer-normalized.

	    The returned stack is, in order: a `DenseLayer` with a linear
	    nonlinearity and no bias, a `LayerNormLayer`, a `ScaleLayer`, a
	    `BiasLayer` and finally a `NonlinearityLayer`.

	    - `incoming`: passed to `DenseLayer` as its input.
	    - `num_units`: number of units of the `DenseLayer`; must be
	      greater than 1.
	    - `nonlinearity`: applied last, through the `NonlinearityLayer`.
	    - `W`: weight specification for the `DenseLayer`.
	    - `b`: bias specification for the `BiasLayer` (the `DenseLayer`
	      itself is created with ``b=None``).
	    - `kwargs`: forwarded to the `DenseLayer` only.

	    Returns the final `NonlinearityLayer`.  Raises `ValueError` when
	    `num_units` is not greater than 1.
	    """
	    # Explicit check instead of `assert`, which disappears under
	    # `python -O`.
	    if not (num_units > 1):
	        raise ValueError(
	            'num_units must be greater than 1, got %r' % (num_units,))

	    layer = lasagne.layers.DenseLayer(
	        incoming, num_units, W=W, b=None,
	        nonlinearity=lasagne.nonlinearities.linear,
	        **kwargs)
	    layer = LayerNormLayer(layer)
	    layer = lasagne.layers.ScaleLayer(layer)
	    layer = lasagne.layers.BiasLayer(layer, b=b)
	    return lasagne.layers.NonlinearityLayer(layer, nonlinearity=nonlinearity)
	Copyright 2017, Takafumi Arakaki

	Redistribution and use in source and binary forms, with or without
	modification, are permitted provided that the following conditions are
	met:

	1. Redistributions of source code must retain the above copyright
	notice, this list of conditions and the following disclaimer.

	2. Redistributions in binary form must reproduce the above copyright
	notice, this list of conditions and the following disclaimer in the
	documentation and/or other materials provided with the distribution.

	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
	A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
	HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	import lasagne
	import theano


	class LayerNormalization(object):

	    """
	    Layer normalization over the last axis, followed by a nonlinearity.

	    Holds two Theano shared variables of length `num_units`: the bias
	    `b` and the gain `g`, whose initial values are drawn from the
	    given initializers.  `normalizing_nonlinearity` can be used
	    wherever a plain nonlinearity callable is expected, and
	    `register_to` attaches `b` and `g` to a layer through its
	    `add_param` method.
	    """

	    def __init__(self, num_units,
	                 nonlinearity=lasagne.nonlinearities.rectify,
	                 b=lasagne.init.Constant(0.),
	                 g=lasagne.init.Constant(1.),
	                 eps=1e-5):
	        self.num_units = num_units
	        # Draw the bias before the gain.
	        self.b = theano.shared(b.sample(num_units), name='layer_norm.b')
	        self.g = theano.shared(g.sample(num_units), name='layer_norm.g')
	        self.eps = eps
	        self.nonlinearity = nonlinearity

	    def normalizing_nonlinearity(self, x):
	        """Standardize `x` along its last axis, scale, shift and activate."""
	        # Shape (1, ..., 1, -1): lets the 1-d parameters broadcast
	        # against every leading axis of `x`.
	        param_shape = (1,) * (x.ndim - 1) + (-1,)
	        mean = x.mean(-1, keepdims=True)
	        sigma = theano.tensor.sqrt(x.var(-1, keepdims=True) + self.eps)
	        b = self.b.reshape(param_shape)
	        g = self.g.reshape(param_shape)
	        return self.nonlinearity(g * (x - mean) / sigma + b)

	    def register_to(self, layer):
	        """Add `b` and then `g` to `layer` via `layer.add_param`."""
	        layer.add_param(self.b, (self.num_units,))
	        layer.add_param(self.g, (self.num_units,))


	class RecurrentNormalizingLayer(lasagne.layers.RecurrentLayer):

	    """
	    `RecurrentLayer` whose nonlinearity is wrapped in layer normalization.

	    The parent layer is created with ``b=None``; the `b` and `g`
	    initializers are given to a `LayerNormalization` helper instead,
	    whose `normalizing_nonlinearity` is installed as the parent's
	    nonlinearity.  The helper's parameters are registered on
	    `self.hidden_to_hidden`.  Note that the default `eps` here (0.05)
	    differs from the default of `LayerNormalization` (1e-5).
	    """

	    def __init__(self, incoming, num_units,
	                 W_hid_to_hid=lasagne.init.Uniform(1e-4),
	                 b=lasagne.init.Uniform(0.1),
	                 g=lasagne.init.Constant(1.),
	                 hid_init=lasagne.init.Uniform(0.1),
	                 nonlinearity=lasagne.nonlinearities.rectify,
	                 eps=0.05,
	                 **kwargs):
	        self.layer_normalization = LayerNormalization(
	            num_units, nonlinearity=nonlinearity, b=b, g=g, eps=eps)
	        super(RecurrentNormalizingLayer, self).__init__(
	            incoming, num_units,
	            W_hid_to_hid=W_hid_to_hid,
	            # The bias is owned by the normalizer, not by the parent.
	            b=None,
	            hid_init=hid_init,
	            nonlinearity=self.layer_normalization.normalizing_nonlinearity,
	            **kwargs)
	        # `hidden_to_hidden` only exists after the parent constructor ran.
	        self.layer_normalization.register_to(self.hidden_to_hidden)
	import lasagne
	import numpy as np
	import pytest

	from layer_normalization import LayerNormLayer, layer_normalized_dense_layer


	def np_norm_layer(x, epsilon=0):
	    """
	    NumPy reference implementation of layer normalization.

	    Standardizes `x` over every axis except the first one: the mean is
	    subtracted and the result is divided by ``sqrt(var + epsilon)``.
	    The output has the same shape as `x`.
	    """
	    kwds = dict(axis=tuple(range(1, len(x.shape))), keepdims=True)
	    mean = x.mean(**kwds)
	    std = np.sqrt(x.var(**kwds) + epsilon)
	    return (x - mean) / std


	@pytest.mark.parametrize('def_shape, real_shape', [
	    ((2, 3), None),
	    ((None, 3), (2, 3)),
	    ((2, 3, 4), None),
	    ((None, 3, 4), (2, 3, 4)),
	])
	def test_layer_norm_layer(def_shape, real_shape):
	    """
	    Compare `LayerNormLayer` output with the NumPy reference.

	    `def_shape` is the shape declared on the input layer (it may
	    contain `None`); `real_shape`, when given, is the shape of the
	    data actually fed in.
	    """
	    l0 = lasagne.layers.InputLayer(def_shape)
	    l1 = LayerNormLayer(l0)
	    out = lasagne.layers.get_output(l1)

	    rs = np.random.RandomState(0)
	    x = rs.randn(*(real_shape or def_shape))
	    actual = out.eval({l0.input_var: x})

	    # Use the layer's own epsilon so both sides add the same constant.
	    desired = np_norm_layer(x, l1.epsilon)
	    assert desired.shape == x.shape

	    np.testing.assert_almost_equal(actual, desired)


	@pytest.mark.parametrize('batchsize, in_dim, out_dim', [
	    (2, 3, 4),
	    (10, 30, 20),
	])
	def test_layer_normalized_dense_layer(batchsize, in_dim, out_dim):
	    """
	    Compare `layer_normalized_dense_layer` with a NumPy computation.

	    The epsilon of the embedded `LayerNormLayer` is set to 0 so the
	    reference can be computed with `np_norm_layer`'s default.
	    """
	    rs = np.random.RandomState(0)
	    x = rs.randn(batchsize, in_dim)
	    W = rs.randn(in_dim, out_dim)
	    b = rs.randn(out_dim)

	    l0 = lasagne.layers.InputLayer(x.shape)
	    l1 = layer_normalized_dense_layer(l0, out_dim, W=W, b=b)
	    # Zero the epsilon of the first LayerNormLayer found; fail loudly
	    # if the stack does not contain one.
	    for layer in lasagne.layers.get_all_layers(l1):
	        if isinstance(layer, LayerNormLayer):
	            assert hasattr(layer, 'epsilon')
	            layer.epsilon = 0
	            break
	    else:
	        raise ValueError('LayerNormLayer not found')
	    out = lasagne.layers.get_output(l1)
	    actual = out.eval({l0.input_var: x})

	    a = np.tensordot(x, W, axes=1)
	    desired = l1.nonlinearity(np_norm_layer(a) + b)

	    # Guard against a trivially all-zero expectation.
	    assert (desired > 0).any()
	    np.testing.assert_almost_equal(actual, desired)