justheuristic · October 4, 2017 12:57
diff --git a/Hierarchical softmax layer.ipynb b/Hierarchical softmax layer.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import theano\n",
    "from lasagne.layers import InputLayer,get_output\n",
    "from hierarchical_softmax_layer import HierarchicalSoftmaxDenseLayer\n",
    "\n",
    "def test_h_softmax_layer(n_samples=100,n_classes=1000):\n",
    "    input_X = theano.shared(np.zeros([100,10]))\n",
    "    target_y = theano.shared(np.random.randint(0,n_classes,size=n_samples))\n",
    "\n",
    "    l_in = InputLayer([None,10],input_X)\n",
    "\n",
    "    #probability estimator\n",
    "    l_sm = HierarchicalSoftmaxDenseLayer(l_in,n_classes,\n",
    "                                         name='hierarchical_softmax')\n",
    "    print 'params',l_sm.get_params()\n",
    "    print 'shape without target',l_sm.output_shape\n",
    "\n",
    "    #target class probability estimators\n",
    "    l_sm_target = HierarchicalSoftmaxDenseLayer(l_in,n_classes,target=target_y[:,None],name='h softmax loss')\n",
    "    print 'shape with target:',l_sm_target.output_shape\n",
    "\n",
    "    l_sm_target_flat = HierarchicalSoftmaxDenseLayer(l_in,n_classes,target=target_y,name='another loss')\n",
    "\n",
    "\n",
    "\n",
    "    out = get_output([l_sm,l_sm_target,l_sm_target_flat])\n",
    "\n",
    "    f = theano.function([],out)\n",
    "\n",
    "    sm0,probas1,probas2 = f()\n",
    "\n",
    "    assert sm0.shape ==(n_samples,n_classes)\n",
    "    assert np.allclose(sm0.sum(1),np.ones(n_samples),0.1)\n",
    "\n",
    "    assert all(probas1 == sm0[np.arange(n_samples),target_y.get_value()])\n",
    "    assert all(probas2 == probas1)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "params [hierarchical_softmax.W1, hierarchical_softmax.b1, hierarchical_softmax.W2, hierarchical_softmax.b2]\n",
      "shape without target (None, 1000)\n",
      "shape with target: (None,)\n"
     ]
    }
   ],
   "source": [
    "test_h_softmax_layer(100,1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [Root]",
   "language": "python",
   "name": "Python [Root]"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
 }
diff --git a/hierarchical_softmax_layer.py b/hierarchical_softmax_layer.py
 import numpy as np
 import theano.tensor as T
 from lasagne import init
 from lasagne.layers import Layer,MergeLayer, InputLayer,flatten


 class HierarchicalSoftmaxDenseLayer(MergeLayer):
    """
    Two-level hierarchical softmax output layer.

    Wraps theano.tensor.nnet.h_softmax for a more convenient usage as lasagne layer.

    :param incoming: incoming lasagne layer. If its output has more than two
        dimensions it is flattened to (batch_size, n_features) first.
    :param num_units: the total number of outputs (final classes).
    :param n_classes: the number of intermediate classes of the two-layer hierarchical softmax.
        It corresponds to the number of outputs of the first softmax. See note at
        the end. 'auto' (default) means ceil(sqrt(num_units)), or
        ceil(num_units / n_outputs_per_class) if n_outputs_per_class is given.
    :param n_outputs_per_class: the number of outputs per intermediate class.
        See note at the end. 'auto' (default) means ceil(num_units / n_classes).
    :param W1_init: initializer passed to add_param for a parameter of shape
        (number of features of the input x, n_classes):
        the weight matrix of the first softmax, which maps the input x to the
        probabilities of the classes.
    :param b1_init: initializer passed to add_param for a parameter of shape (n_classes,):
        the bias vector of the first softmax layer.
    :param W2_init: initializer passed to add_param for a parameter of shape
        (n_classes, number of features of the input x, n_outputs_per_class):
        the weight matrix of the second softmax, which maps the input x to
        the probabilities of the outputs.
    :param b2_init: initializer passed to add_param for a parameter of shape
        (n_classes, n_outputs_per_class):
        the bias vector of the second softmax layer.
    :param target: lasagne layer or tensor of shape either (batch_size,) or (batch_size, 1)
        (optional, default None)
        contains the indices of the targets for the minibatch
        input x. For each input, the function computes the output for its
        corresponding target. If target is None, then all the outputs are
        computed for each input.
    :param kwargs: forwarded unchanged to MergeLayer.__init__ (e.g. name).

    Notes
    -----
    The product of n_outputs_per_class and n_classes has to be greater or equal
    to num_units. If it is strictly greater, then the irrelevant outputs will
    be ignored.
    The most computational efficient configuration is when n_outputs_per_class
    and n_classes are equal to the square root of num_units.
    """
    def __init__(self, incoming, num_units,
                 n_classes='auto',
                 n_outputs_per_class='auto',
                 W1_init=init.GlorotUniform(),
                 b1_init=init.Constant(0),
                 W2_init=init.GlorotUniform(),
                 b2_init=init.Constant(0),
                 target=None,
                 **kwargs):

        # flatten input layer if it has higher dimensionality
        if len(incoming.output_shape) != 2:
            assert len(incoming.output_shape) >= 2, \
                "incoming layer must have at least 2 dimensions"
            incoming = flatten(incoming)

        incomings = [incoming]

        # add target if provided (as theano tensor or lasagne layer)
        if target is not None:

            # convert tensor to layer
            if not isinstance(target, Layer):
                assert target.ndim <= 2, \
                    "target must have shape (batch_size,) or (batch_size, 1)"
                # Use output_shape, which every lasagne layer exposes; the
                # `.shape` attribute exists on InputLayer only, so reading it
                # failed for any other (e.g. dense or flattened) incoming layer.
                batch_size = incoming.output_shape[0]
                if target.ndim == 1:
                    target_shape = (batch_size,)
                else:
                    target_shape = (batch_size, 1)

                target = InputLayer(target_shape, input_var=target,
                                    name="target inputlayer")

            # check shape
            assert len(target.output_shape) <= 2, \
                "target must have shape (batch_size,) or (batch_size, 1)"
            if len(target.output_shape) == 2:
                assert target.output_shape[1] == 1, \
                    "2-dimensional target must have exactly one column"

            incomings.append(target)

        super(HierarchicalSoftmaxDenseLayer, self).__init__(incomings, **kwargs)

        # infer whichever of the two tree dimensions was left as 'auto'
        if n_classes == 'auto':
            if n_outputs_per_class == 'auto':
                n_classes = int(np.ceil(num_units ** .5))
            else:
                n_classes = int(np.ceil(float(num_units) / n_outputs_per_class))
        if n_outputs_per_class == 'auto':
            assert n_classes != 'auto'
            n_outputs_per_class = int(np.ceil(float(num_units) / n_classes))

        assert n_classes * n_outputs_per_class >= num_units, \
            "n_classes * n_outputs_per_class must be at least num_units"

        # remember dimensions
        self.num_units = num_units
        self.n_classes = n_classes
        self.n_outputs_per_class = n_outputs_per_class

        # create params; biases are excluded from regularization
        n_inputs = incoming.output_shape[1]
        self.W1 = self.add_param(W1_init, (n_inputs, self.n_classes),
                                 name="W1")
        self.b1 = self.add_param(b1_init, (self.n_classes,),
                                 name="b1", regularizable=False)
        self.W2 = self.add_param(W2_init,
                                 (self.n_classes, n_inputs, self.n_outputs_per_class),
                                 name="W2")
        self.b2 = self.add_param(b2_init,
                                 (self.n_classes, self.n_outputs_per_class),
                                 name="b2", regularizable=False)

    def get_output_for(self, inputs, return_probas_anyway=False, **kwargs):
        """
        Build the hierarchical softmax expression.

        :param inputs: list of input expressions: [x] or [x, target], in the
            order the incoming layers were registered in __init__.
        :param return_probas_anyway: if True, returns all probabilities even
            if target is provided.

        Returns
        -------
        output_probs: tensor of shape (batch_size, num_units) or (batch_size,)
            Output of the two-layer hierarchical softmax for input x. If target is
            not specified (None), then all the outputs are computed and the
            returned tensor has shape (batch_size, num_units). Otherwise, when
            target is specified, only the corresponding outputs are computed and
            the returned tensor has thus shape (batch_size,).
        """
        features = inputs[0]

        if len(inputs) == 1 or return_probas_anyway:
            target = None
        else:
            assert len(inputs) == 2
            target = inputs[1]

        return T.nnet.h_softmax(features, features.shape[0],
                                self.num_units, self.n_classes,
                                self.n_outputs_per_class,
                                W1=self.W1, b1=self.b1,
                                W2=self.W2, b2=self.b2,
                                target=target)

    def get_output_shape_for(self, input_shapes, **kwargs):
        """
        Return (batch_size, num_units) when the layer has no target input and
        (batch_size,) when it has one.

        NOTE: the decision depends only on the number of inputs, so it does
        not reflect return_probas_anyway=True passed to get_output_for.
        """
        batch_size = input_shapes[0][0]
        if len(input_shapes) == 1:
            return (batch_size, self.num_units)
        else:
            return (batch_size,)
                                    
diff --git a/License.md b/License.md
diff --git a/performance_test.ipynb b/performance_test.ipynb
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"import theano\n",
	"from lasagne.layers import InputLayer,get_output\n",
	"from hierarchical_softmax_layer import HierarchicalSoftmaxDenseLayer\n",
	"\n",
	"def test_h_softmax_layer(n_samples=100,n_classes=1000):\n",
	" input_X = theano.shared(np.zeros([100,10]))\n",
	" target_y = theano.shared(np.random.randint(0,n_classes,size=n_samples))\n",
	"\n",
	" l_in = InputLayer([None,10],input_X)\n",
	"\n",
	" #probability estimator\n",
	" l_sm = HierarchicalSoftmaxDenseLayer(l_in,n_classes,\n",
	" name='hierarchical_softmax')\n",
	" print 'params',l_sm.get_params()\n",
	" print 'shape without target',l_sm.output_shape\n",
	"\n",
	" #target class probability estimators\n",
	" l_sm_target = HierarchicalSoftmaxDenseLayer(l_in,n_classes,target=target_y[:,None],name='h softmax loss')\n",
	" print 'shape with target:',l_sm_target.output_shape\n",
	"\n",
	" l_sm_target_flat = HierarchicalSoftmaxDenseLayer(l_in,n_classes,target=target_y,name='another loss')\n",
	"\n",
	"\n",
	"\n",
	" out = get_output([l_sm,l_sm_target,l_sm_target_flat])\n",
	"\n",
	" f = theano.function([],out)\n",
	"\n",
	" sm0,probas1,probas2 = f()\n",
	"\n",
	" assert sm0.shape ==(n_samples,n_classes)\n",
	" assert np.allclose(sm0.sum(1),np.ones(n_samples),0.1)\n",
	"\n",
	" assert all(probas1 == sm0[np.arange(n_samples),target_y.get_value()])\n",
	" assert all(probas2 == probas1)\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"params [hierarchical_softmax.W1, hierarchical_softmax.b1, hierarchical_softmax.W2, hierarchical_softmax.b2]\n",
	"shape without target (None, 1000)\n",
	"shape with target: (None,)\n"
	]
	}
	],
	"source": [
	"test_h_softmax_layer(100,1000)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python [Root]",
	"language": "python",
	"name": "Python [Root]"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.12"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}
	import numpy as np
	import theano.tensor as T
	from lasagne import init
	from lasagne.layers import Layer,MergeLayer, InputLayer,flatten


	class HierarchicalSoftmaxDenseLayer(MergeLayer):
	    """
	    Two-level hierarchical softmax output layer.

	    Wraps theano.tensor.nnet.h_softmax for a more convenient usage as lasagne layer.

	    :param incoming: incoming lasagne layer. If its output has more than two
	        dimensions it is flattened to (batch_size, n_features) first.
	    :param num_units: the total number of outputs (final classes).
	    :param n_classes: the number of intermediate classes of the two-layer hierarchical softmax.
	        It corresponds to the number of outputs of the first softmax. See note at
	        the end. 'auto' (default) means ceil(sqrt(num_units)), or
	        ceil(num_units / n_outputs_per_class) if n_outputs_per_class is given.
	    :param n_outputs_per_class: the number of outputs per intermediate class.
	        See note at the end. 'auto' (default) means ceil(num_units / n_classes).
	    :param W1_init: initializer passed to add_param for a parameter of shape
	        (number of features of the input x, n_classes):
	        the weight matrix of the first softmax, which maps the input x to the
	        probabilities of the classes.
	    :param b1_init: initializer passed to add_param for a parameter of shape (n_classes,):
	        the bias vector of the first softmax layer.
	    :param W2_init: initializer passed to add_param for a parameter of shape
	        (n_classes, number of features of the input x, n_outputs_per_class):
	        the weight matrix of the second softmax, which maps the input x to
	        the probabilities of the outputs.
	    :param b2_init: initializer passed to add_param for a parameter of shape
	        (n_classes, n_outputs_per_class):
	        the bias vector of the second softmax layer.
	    :param target: lasagne layer or tensor of shape either (batch_size,) or (batch_size, 1)
	        (optional, default None)
	        contains the indices of the targets for the minibatch
	        input x. For each input, the function computes the output for its
	        corresponding target. If target is None, then all the outputs are
	        computed for each input.
	    :param kwargs: forwarded unchanged to MergeLayer.__init__ (e.g. name).

	    Notes
	    -----
	    The product of n_outputs_per_class and n_classes has to be greater or equal
	    to num_units. If it is strictly greater, then the irrelevant outputs will
	    be ignored.
	    The most computational efficient configuration is when n_outputs_per_class
	    and n_classes are equal to the square root of num_units.
	    """
	    def __init__(self, incoming, num_units,
	                 n_classes='auto',
	                 n_outputs_per_class='auto',
	                 W1_init=init.GlorotUniform(),
	                 b1_init=init.Constant(0),
	                 W2_init=init.GlorotUniform(),
	                 b2_init=init.Constant(0),
	                 target=None,
	                 **kwargs):

	        # flatten input layer if it has higher dimensionality
	        if len(incoming.output_shape) != 2:
	            assert len(incoming.output_shape) >= 2, \
	                "incoming layer must have at least 2 dimensions"
	            incoming = flatten(incoming)

	        incomings = [incoming]

	        # add target if provided (as theano tensor or lasagne layer)
	        if target is not None:

	            # convert tensor to layer
	            if not isinstance(target, Layer):
	                assert target.ndim <= 2, \
	                    "target must have shape (batch_size,) or (batch_size, 1)"
	                # Use output_shape, which every lasagne layer exposes; the
	                # `.shape` attribute exists on InputLayer only, so reading it
	                # failed for any other (e.g. dense or flattened) incoming layer.
	                batch_size = incoming.output_shape[0]
	                if target.ndim == 1:
	                    target_shape = (batch_size,)
	                else:
	                    target_shape = (batch_size, 1)

	                target = InputLayer(target_shape, input_var=target,
	                                    name="target inputlayer")

	            # check shape
	            assert len(target.output_shape) <= 2, \
	                "target must have shape (batch_size,) or (batch_size, 1)"
	            if len(target.output_shape) == 2:
	                assert target.output_shape[1] == 1, \
	                    "2-dimensional target must have exactly one column"

	            incomings.append(target)

	        super(HierarchicalSoftmaxDenseLayer, self).__init__(incomings, **kwargs)

	        # infer whichever of the two tree dimensions was left as 'auto'
	        if n_classes == 'auto':
	            if n_outputs_per_class == 'auto':
	                n_classes = int(np.ceil(num_units ** .5))
	            else:
	                n_classes = int(np.ceil(float(num_units) / n_outputs_per_class))
	        if n_outputs_per_class == 'auto':
	            assert n_classes != 'auto'
	            n_outputs_per_class = int(np.ceil(float(num_units) / n_classes))

	        assert n_classes * n_outputs_per_class >= num_units, \
	            "n_classes * n_outputs_per_class must be at least num_units"

	        # remember dimensions
	        self.num_units = num_units
	        self.n_classes = n_classes
	        self.n_outputs_per_class = n_outputs_per_class

	        # create params; biases are excluded from regularization
	        n_inputs = incoming.output_shape[1]
	        self.W1 = self.add_param(W1_init, (n_inputs, self.n_classes),
	                                 name="W1")
	        self.b1 = self.add_param(b1_init, (self.n_classes,),
	                                 name="b1", regularizable=False)
	        self.W2 = self.add_param(W2_init,
	                                 (self.n_classes, n_inputs, self.n_outputs_per_class),
	                                 name="W2")
	        self.b2 = self.add_param(b2_init,
	                                 (self.n_classes, self.n_outputs_per_class),
	                                 name="b2", regularizable=False)

	    def get_output_for(self, inputs, return_probas_anyway=False, **kwargs):
	        """
	        Build the hierarchical softmax expression.

	        :param inputs: list of input expressions: [x] or [x, target], in the
	            order the incoming layers were registered in __init__.
	        :param return_probas_anyway: if True, returns all probabilities even
	            if target is provided.

	        Returns
	        -------
	        output_probs: tensor of shape (batch_size, num_units) or (batch_size,)
	            Output of the two-layer hierarchical softmax for input x. If target is
	            not specified (None), then all the outputs are computed and the
	            returned tensor has shape (batch_size, num_units). Otherwise, when
	            target is specified, only the corresponding outputs are computed and
	            the returned tensor has thus shape (batch_size,).
	        """
	        features = inputs[0]

	        if len(inputs) == 1 or return_probas_anyway:
	            target = None
	        else:
	            assert len(inputs) == 2
	            target = inputs[1]

	        return T.nnet.h_softmax(features, features.shape[0],
	                                self.num_units, self.n_classes,
	                                self.n_outputs_per_class,
	                                W1=self.W1, b1=self.b1,
	                                W2=self.W2, b2=self.b2,
	                                target=target)

	    def get_output_shape_for(self, input_shapes, **kwargs):
	        """
	        Return (batch_size, num_units) when the layer has no target input and
	        (batch_size,) when it has one.

	        NOTE: the decision depends only on the number of inputs, so it does
	        not reflect return_probas_anyway=True passed to get_output_for.
	        """
	        batch_size = input_shapes[0][0]
	        if len(input_shapes) == 1:
	            return (batch_size, self.num_units)
	        else:
	            return (batch_size,)