Feel free to use this layer anywhere and/or include it in any library, with or without notifying us. tl;dr: do whatever you want.
Devised by justheuristic@, khalman@ and https://github.com/ADmitri42.
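
A minimal usage sketch (hypothetical shapes; the tested API is demonstrated in the notebook and docstring below):

    from lasagne.layers import InputLayer, get_output
    from hierarchical_softmax_layer import HierarchicalSoftmaxDenseLayer

    l_in = InputLayer((None, 10))                               # 10 input features
    l_sm = HierarchicalSoftmaxDenseLayer(l_in, num_units=1000)  # softmax over 1000 outputs
    probs = get_output(l_sm)                                    # symbolic (batch, 1000) probabilities

Passing target=<layer or int tensor> instead returns only each sample's target-class probability, shape (batch,), which is the cheap path for a training loss.
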
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import theano\n",
    "from lasagne.layers import InputLayer, get_output\n",
    "from hierarchical_softmax_layer import HierarchicalSoftmaxDenseLayer\n",
    "\n",
"def test_h_softmax_layer(n_samples=100,n_classes=1000):\n", | |
" input_X = theano.shared(np.zeros([100,10]))\n", | |
" target_y = theano.shared(np.random.randint(0,n_classes,size=n_samples))\n", | |
"\n", | |
" l_in = InputLayer([None,10],input_X)\n", | |
"\n", | |
" #probability estimator\n", | |
" l_sm = HierarchicalSoftmaxDenseLayer(l_in,n_classes,\n", | |
" name='hierarchical_softmax')\n", | |
" print 'params',l_sm.get_params()\n", | |
" print 'shape without target',l_sm.output_shape\n", | |
"\n", | |
" #target class probability estimators\n", | |
" l_sm_target = HierarchicalSoftmaxDenseLayer(l_in,n_classes,target=target_y[:,None],name='h softmax loss')\n", | |
" print 'shape with target:',l_sm_target.output_shape\n", | |
"\n", | |
" l_sm_target_flat = HierarchicalSoftmaxDenseLayer(l_in,n_classes,target=target_y,name='another loss')\n", | |
"\n", | |
"\n", | |
"\n", | |
" out = get_output([l_sm,l_sm_target,l_sm_target_flat])\n", | |
"\n", | |
" f = theano.function([],out)\n", | |
"\n", | |
" sm0,probas1,probas2 = f()\n", | |
"\n", | |
" assert sm0.shape ==(n_samples,n_classes)\n", | |
" assert np.allclose(sm0.sum(1),np.ones(n_samples),0.1)\n", | |
"\n", | |
" assert all(probas1 == sm0[np.arange(n_samples),target_y.get_value()])\n", | |
" assert all(probas2 == probas1)\n" | |
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "params [hierarchical_softmax.W1, hierarchical_softmax.b1, hierarchical_softmax.W2, hierarchical_softmax.b2]\n",
      "shape without target (None, 1000)\n",
      "shape with target: (None,)\n"
     ]
    }
   ],
   "source": [
    "test_h_softmax_layer(100, 1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [Root]",
   "language": "python",
   "name": "Python [Root]"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
import numpy as np
import theano.tensor as T
from lasagne import init
from lasagne.layers import Layer, MergeLayer, InputLayer, flatten


class HierarchicalSoftmaxDenseLayer(MergeLayer):
""" | |
Wraps theano.tensor.nnet.h_softmax for a more convenient usage as lasagne layer. | |
:param incoming: incoming lasagne layer | |
:param num_units: the number of outputs | |
:param n_classes: the number of intermediate classes of the two-layer hierarchical softmax. | |
It corresponds to the number of outputs of the first softmax. See note at | |
the end. Defaults to sqrt(num_units) or can be inferred from n_outputs_per_class. | |
:param n_outputs_per_class: the number of outputs per intermediate class. | |
See note at the end. int, can be inferred | |
:param W1: lasagne init or a tensor of shape (number of features of the input x, n_classes) | |
the weight matrix of the first softmax, which maps the input x to the | |
probabilities of the classes. | |
:param b1: lasagne init or a tensor of shape (n_classes,) | |
the bias vector of the first softmax layer. | |
:param W2: lasagne init or a tensor of shape | |
(n_classes, number of features of the input x, n_outputs_per_class) | |
the weight matrix of the second softmax, which maps the input x to | |
the probabilities of the outputs. | |
:param b2: tensor of shape (n_classes, n_outputs_per_class) | |
the bias vector of the second softmax layer. | |
:param target: lasagne layer or tensor of shape either (batch_size,) or (batch_size, 1) | |
(optional, default None) | |
contains the indices of the targets for the minibatch | |
input x. For each input, the function computes the output for its | |
corresponding target. If target is None, then all the outputs are | |
computed for each input. | |
Notes | |
----- | |
The product of n_outputs_per_class and n_classes has to be greater or equal | |
to n_outputs. If it is strictly greater, then the irrelevant outputs will | |
be ignored. | |
n_outputs_per_class and n_classes have to be the same as the corresponding | |
dimensions of the tensors of W1, b1, W2 and b2. | |
The most computational efficient configuration is when n_outputs_per_class | |
and n_classes are equal to the square root of n_outputs. | |
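
    Example
    -------
    A minimal sketch of how the 'auto' dimensions are inferred: with
    num_units=1000 and both n_classes and n_outputs_per_class left at 'auto',
    both are set to ceil(sqrt(1000)) = 32, and 32 * 32 = 1024 >= 1000, so the
    trailing 24 outputs are ignored.

    >>> l_in = InputLayer((None, 10))
    >>> l_sm = HierarchicalSoftmaxDenseLayer(l_in, num_units=1000)
    >>> l_sm.n_classes, l_sm.n_outputs_per_class
    (32, 32)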
""" | |

    def __init__(self, incoming, num_units,
                 n_classes='auto',
                 n_outputs_per_class='auto',
                 W1_init=init.GlorotUniform(),
                 b1_init=init.Constant(0),
                 W2_init=init.GlorotUniform(),
                 b2_init=init.Constant(0),
                 target=None,
                 **kwargs):
        # flatten the input layer if it has more than 2 dimensions
        if len(incoming.output_shape) != 2:
            assert len(incoming.output_shape) >= 2
            incoming = flatten(incoming)
        incomings = [incoming]
        # add target if provided (as theano tensor or lasagne layer)
        if target is not None:
            # wrap a raw tensor into an InputLayer
            if not isinstance(target, Layer):
                assert target.ndim <= 2
                if target.ndim == 1:
                    target_shape = (incoming.output_shape[0],)
                else:
                    target_shape = (incoming.output_shape[0], 1)
                target = InputLayer(target_shape, input_var=target, name="target inputlayer")
            # check shape: target must be (batch,) or (batch, 1)
            assert len(target.output_shape) <= 2
            if len(target.output_shape) == 2:
                assert target.output_shape[1] == 1
            incomings.append(target)
        super(HierarchicalSoftmaxDenseLayer, self).__init__(incomings, **kwargs)

        # infer classes
        if n_classes == 'auto':
            if n_outputs_per_class == 'auto':
                n_classes = int(np.ceil(num_units ** .5))
            else:
                n_classes = int(np.ceil(float(num_units) / n_outputs_per_class))
        if n_outputs_per_class == 'auto':
            assert n_classes != 'auto'
            n_outputs_per_class = int(np.ceil(float(num_units) / n_classes))
        assert n_classes * n_outputs_per_class >= num_units

        # remember dimensions
        self.num_units = num_units
        self.n_classes = n_classes
        self.n_outputs_per_class = n_outputs_per_class

        # create params
        n_inputs = incoming.output_shape[1]
        self.W1 = self.add_param(W1_init, (n_inputs, self.n_classes),
                                 name="W1")
        self.b1 = self.add_param(b1_init, (self.n_classes,),
                                 name="b1", regularizable=False)
        self.W2 = self.add_param(W2_init, (self.n_classes, n_inputs, self.n_outputs_per_class),
                                 name="W2")
        self.b2 = self.add_param(b2_init, (self.n_classes, self.n_outputs_per_class),
                                 name="b2", regularizable=False)

    def get_output_for(self, inputs, return_probas_anyway=False, **kwargs):
        """
        :param return_probas_anyway: if True, returns all probabilities
            even if a target is provided.

        Returns
        -------
        output_probs: tensor of shape (batch_size, num_units) or (batch_size,)
            Output of the two-layer hierarchical softmax for input x. If target
            was not specified (None), all the outputs are computed and the
            returned tensor has shape (batch_size, num_units). Otherwise, only
            the outputs for the given targets are computed and the returned
            tensor has shape (batch_size,).
        """
        input = inputs[0]
        if len(inputs) == 1 or return_probas_anyway:
            target = None
        else:
            assert len(inputs) == 2
            target = inputs[1]
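        # theano's built-in two-level hierarchical softmax: with target=None it
        # returns all num_units probabilities, shape (batch, num_units); with a
        # target it returns only p(target | x) per row, shape (batch,)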
        return T.nnet.h_softmax(input, input.shape[0],
                                self.num_units, self.n_classes,
                                self.n_outputs_per_class,
                                W1=self.W1, b1=self.b1,
                                W2=self.W2, b2=self.b2,
                                target=target)

    def get_output_shape_for(self, input_shapes, **kwargs):
        if len(input_shapes) == 1:
            return (input_shapes[0][0], self.num_units)
        else:
            return (input_shapes[0][0],)
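

if __name__ == '__main__':
    # A minimal training-loss sketch (hypothetical variable names; assumes the
    # usual lasagne training workflow):
    import theano
    from lasagne.layers import get_output, get_all_params
    from lasagne.updates import adam

    input_var = T.matrix('x')
    target_var = T.ivector('y')
    l_in = InputLayer((None, 10), input_var)
    l_loss = HierarchicalSoftmaxDenseLayer(l_in, 1000, target=target_var)
    # with a target, get_output returns p(target | x) of shape (batch,),
    # so the training loss is the mean negative log-likelihood
    loss = -T.log(get_output(l_loss)).mean()
    updates = adam(loss, get_all_params(l_loss, trainable=True))
    train_fn = theano.function([input_var, target_var], loss, updates=updates)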