Feel free to use this layer anywhere and/or include it in any library, with or without notifying us. TL;DR: do whatever you want.
Devised by justheuristic@, khalman@ and https://github.com/ADmitri42.
| import numpy as np | |
| import theano.tensor as T | |
| from lasagne import init | |
| from lasagne.layers import Layer,MergeLayer, InputLayer,flatten | |
class HierarchicalSoftmaxDenseLayer(MergeLayer):
    """
    Wraps theano.tensor.nnet.h_softmax for more convenient usage as a lasagne layer.

    :param incoming: incoming lasagne layer; flattened to 2D automatically if
        it has higher dimensionality.
    :param num_units: the number of outputs.
    :param n_classes: the number of intermediate classes of the two-layer
        hierarchical softmax. It corresponds to the number of outputs of the
        first softmax. See note at the end. Defaults to ceil(sqrt(num_units))
        or is inferred from n_outputs_per_class.
    :param n_outputs_per_class: the number of outputs per intermediate class.
        See note at the end. int, can be inferred.
    :param W1_init: lasagne init or a tensor of shape
        (number of features of the input x, n_classes);
        the weight matrix of the first softmax, which maps the input x to the
        probabilities of the classes.
    :param b1_init: lasagne init or a tensor of shape (n_classes,);
        the bias vector of the first softmax layer.
    :param W2_init: lasagne init or a tensor of shape
        (n_classes, number of features of the input x, n_outputs_per_class);
        the weight matrix of the second softmax, which maps the input x to
        the probabilities of the outputs.
    :param b2_init: lasagne init or a tensor of shape
        (n_classes, n_outputs_per_class);
        the bias vector of the second softmax layer.
    :param target: lasagne layer or tensor of shape either (batch_size,) or
        (batch_size, 1) (optional, default None);
        contains the indices of the targets for the minibatch input x.
        For each input, the function computes the output for its
        corresponding target. If target is None, then all the outputs are
        computed for each input.

    Notes
    -----
    The product of n_outputs_per_class and n_classes has to be greater or equal
    to num_units. If it is strictly greater, then the irrelevant outputs will
    be ignored.
    n_outputs_per_class and n_classes have to be the same as the corresponding
    dimensions of the tensors of W1, b1, W2 and b2.
    The most computationally efficient configuration is when n_outputs_per_class
    and n_classes are equal to the square root of num_units.
    """

    def __init__(self, incoming, num_units,
                 n_classes='auto',
                 n_outputs_per_class='auto',
                 W1_init=init.GlorotUniform(),
                 b1_init=init.Constant(0),
                 W2_init=init.GlorotUniform(),
                 b2_init=init.Constant(0),
                 target=None,
                 **kwargs):
        # Flatten input layer if it has higher dimensionality than 2.
        if len(incoming.output_shape) != 2:
            assert len(incoming.output_shape) >= 2
            incoming = flatten(incoming)
        incomings = [incoming]

        # Add target if provided (as a theano tensor or a lasagne layer).
        if target is not None:
            # Convert a raw tensor into an InputLayer so MergeLayer can track it.
            if not isinstance(target, Layer):
                assert target.ndim <= 2
                # BUGFIX: generic lasagne layers expose `output_shape`, not
                # `.shape` (only InputLayer happens to have `.shape`), so the
                # original `incoming.shape[0]` crashed whenever `incoming` was
                # any non-input layer — including the flatten() applied above.
                if target.ndim == 1:
                    target_shape = (incoming.output_shape[0],)
                else:
                    target_shape = (incoming.output_shape[0], 1)
                target = InputLayer(target_shape, input_var=target,
                                    name="target inputlayer")
            # Target must be of shape (batch_size,) or (batch_size, 1).
            assert len(target.output_shape) <= 2
            if len(target.output_shape) == 2:
                assert target.output_shape[1] == 1
            incomings.append(target)

        super(HierarchicalSoftmaxDenseLayer, self).__init__(incomings, **kwargs)

        # Infer the hierarchy dimensions not given explicitly.
        if n_classes == 'auto':
            if n_outputs_per_class == 'auto':
                # Balanced two-level tree: both levels ~ sqrt(num_units).
                n_classes = int(np.ceil(num_units ** .5))
            else:
                n_classes = int(np.ceil(float(num_units) / n_outputs_per_class))
        if n_outputs_per_class == 'auto':
            assert n_classes != 'auto'
            n_outputs_per_class = int(np.ceil(float(num_units) / n_classes))
        # The hierarchy must be able to address every output unit.
        assert n_classes * n_outputs_per_class >= num_units

        # Remember dimensions.
        self.num_units = num_units
        self.n_classes = n_classes
        self.n_outputs_per_class = n_outputs_per_class

        # Create params (biases excluded from regularization, as usual).
        n_inputs = incoming.output_shape[1]
        self.W1 = self.add_param(W1_init, (n_inputs, self.n_classes),
                                 name="W1")
        self.b1 = self.add_param(b1_init, (self.n_classes,),
                                 name="b1", regularizable=False)
        self.W2 = self.add_param(W2_init, (self.n_classes, n_inputs, self.n_outputs_per_class),
                                 name="W2")
        self.b2 = self.add_param(b2_init, (self.n_classes, self.n_outputs_per_class),
                                 name="b2", regularizable=False)

    def get_output_for(self, inputs, return_probas_anyway=False, **kwargs):
        """
        Compute the hierarchical softmax output for the given inputs.

        :param inputs: list of [input] or [input, target] theano expressions,
            matching the incomings passed to __init__.
        :param return_probas_anyway: if True, returns all probabilities even
            if a target is provided.

        Returns
        -------
        output_probs: tensor of shape (batch_size, n_outputs) or (batch_size,)
            Output of the two-layer hierarchical softmax for input x. If target
            is not specified (None), then all the outputs are computed and the
            returned tensor has shape (batch_size, n_outputs). Otherwise, when
            target is specified, only the corresponding outputs are computed
            and the returned tensor has thus shape (batch_size,).
        """
        input = inputs[0]
        if len(inputs) == 1 or return_probas_anyway:
            target = None
        else:
            assert len(inputs) == 2
            target = inputs[1]
        return T.nnet.h_softmax(input, input.shape[0],
                                self.num_units, self.n_classes,
                                self.n_outputs_per_class,
                                W1=self.W1, b1=self.b1,
                                W2=self.W2, b2=self.b2,
                                target=target
                                )

    def get_output_shape_for(self, input_shapes, **kwargs):
        # With a target layer, h_softmax returns per-sample probabilities of
        # shape (batch_size,); otherwise the full (batch_size, num_units).
        # NOTE(review): this cannot account for return_probas_anyway, which is
        # only known at get_output_for time.
        if len(input_shapes) == 1:
            return (input_shapes[0][0], self.num_units)
        else:
            return (input_shapes[0][0],)
Feel free to use this layer anywhere and/or include it in any library, with or without notifying us. TL;DR: do whatever you want.
Devised by justheuristic@, khalman@ and https://github.com/ADmitri42.
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Tue Nov 1 22:47:20 2016 \r\n", | |
| "+------------------------------------------------------+ \r\n", | |
| "| NVIDIA-SMI 361.45 Driver Version: 361.45.11 | \r\n", | |
| "|-------------------------------+----------------------+----------------------+\r\n", | |
| "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\r\n", | |
| "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\r\n", | |
| "|===============================+======================+======================|\r\n", | |
| "| 0 GeForce GTX TIT... Off | 0000:02:00.0 Off | N/A |\r\n", | |
| "| 22% 50C P8 16W / 250W | 584MiB / 12287MiB | 0% Default |\r\n", | |
| "+-------------------------------+----------------------+----------------------+\r\n", | |
| "| 1 GeForce GTX 680 Off | 0000:03:00.0 N/A | N/A |\r\n", | |
| "| 40% 45C P8 N/A / N/A | 8MiB / 2047MiB | N/A Default |\r\n", | |
| "+-------------------------------+----------------------+----------------------+\r\n", | |
| " \r\n", | |
| "+-----------------------------------------------------------------------------+\r\n", | |
| "| Processes: GPU Memory |\r\n", | |
| "| GPU PID Type Process name Usage |\r\n", | |
| "|=============================================================================|\r\n", | |
| "| 0 3530 C /home/apanin/anaconda/bin/python 204MiB |\r\n", | |
| "| 0 7414 C /home/apanin/anaconda/bin/python 176MiB |\r\n", | |
| "| 0 16363 C /home/apanin/anaconda/bin/python 175MiB |\r\n", | |
| "| 1 Not Supported |\r\n", | |
| "+-----------------------------------------------------------------------------+\r\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "!nvidia-smi" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "env: THEANO_FLAGS=\"device=gpu0\"\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "%env THEANO_FLAGS=\"device=gpu0\"" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "Using gpu device 0: GeForce GTX TITAN X (CNMeM is disabled, cuDNN 5005)\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "import theano\n", | |
| "import theano.tensor as T\n", | |
| "theano.config.floatX='float32'" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from lasagne.layers import *\n", | |
| "import lasagne\n", | |
| "from hierarchical_softmax_layer import HierarchicalSoftmaxDenseLayer" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "l_in = InputLayer([100,256])\n", | |
| "target_y = T.ivector()\n", | |
| "n_outputs = 10**6\n", | |
| "l_out_softmax = DenseLayer(l_in, n_outputs, nonlinearity=lasagne.nonlinearities.softmax)\n", | |
| "l_out_hsoftmax = HierarchicalSoftmaxDenseLayer(l_in,n_outputs,target=target_y)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import numpy as np\n", | |
| "data = [(np.random.normal(size=[100,256]).astype('float32'),\n", | |
| " np.random.randint(0,n_outputs,size=[100],dtype='int32')) for i in range(100)]\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from lasagne.objectives import categorical_crossentropy as loss\n", | |
| "f_softmax = theano.function([l_in.input_var,target_y],loss(get_output(l_out_softmax),target_y).mean())\n", | |
| "f_hsoftmax = theano.function([l_in.input_var,target_y],get_output(l_out_hsoftmax).mean())" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 4.47 s, sys: 859 ms, total: 5.33 s\n", | |
| "Wall time: 5.33 s\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "%%time\n", | |
| "#volatile gpu util 80~100%, titanx, NO CNMEM!\n", | |
| "for batch in data:\n", | |
| " f_softmax(*batch)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 129 ms, sys: 74 ms, total: 203 ms\n", | |
| "Wall time: 202 ms\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "%%time\n", | |
| "#volatile gpu util 90~100%, titanx, NO CNMEM!\n", | |
| "for batch in data:\n", | |
| " f_hsoftmax(*batch)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 2", | |
| "language": "python", | |
| "name": "python2" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 2 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython2", | |
| "version": "2.7.11" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 0 | |
| } |