Word embedding models (Theano-only code, for reference)
  
        
  
    
    
  
  
    
import numpy as np
import theano
from theano import tensor as T

rng = np.random


class Autoencoder(object):
    def __init__(self, maxnum, reduced_dims, learnrate=0.4):
        self.threshold = 1e-2
        # Input variable (equivalent to dummyword in the original
        # implementation): a (maxnum, 1) column vector, e.g. a one-hot
        # encoding of a word.
        self.inputs = theano.shared(np.zeros((maxnum, 1), dtype=np.float32))
        self.W1 = theano.shared((rng.randn(reduced_dims, maxnum) * 0.1)
                                .astype(theano.config.floatX), name='W1')
        self.W2 = theano.shared((rng.randn(maxnum, reduced_dims) * 0.1)
                                .astype(theano.config.floatX), name='W2')
        self.output = T.dot(self.W1, self.inputs)
        self.recons = T.dot(self.W2, self.output)
        self.totloss = T.sum((self.inputs - self.recons) ** 2)
        # Gradients w.r.t. parameters, clipped to (-threshold, threshold)
        self.W1_grad = T.clip(T.grad(self.totloss, self.W1),
                              -1 * self.threshold, self.threshold)
        self.W2_grad = T.clip(T.grad(self.totloss, self.W2),
                              -1 * self.threshold, self.threshold)
        self.updates = [(self.W1, self.W1 - learnrate * self.W1_grad),
                        (self.W2, self.W2 - learnrate * self.W2_grad)]
        self.train = theano.function([], self.totloss, updates=self.updates,
                                     allow_input_downcast=True)

    def trainonone(self, wordvec):
        # One gradient step on a single (maxnum, 1) input vector.
        wordvec = np.asarray(wordvec, dtype=np.float32)
        self.inputs.set_value(wordvec)
        self.loss = self.train()
        print("Loss incurred:", self.loss)

    def getoutput(self, wordvec):
        # Returns the embedding for a word (the hidden activation W1 * input).
        self.inputs.set_value(np.asarray(wordvec, dtype=np.float32))
        genembedding = self.output.eval()
        return genembedding
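
For reference, a minimal usage sketch for the Autoencoder above (not part of the original gist): it trains on a one-hot column vector and reads back the learned embedding. The vocabulary size, embedding width, and number of passes are arbitrary illustrative choices.

import numpy as np

# Hypothetical toy run: 50-word vocabulary, 10-dimensional embeddings.
maxnum, reduced_dims = 50, 10
ae = Autoencoder(maxnum, reduced_dims)

# Train a few steps on the one-hot vector for word index 3.
onehot = np.zeros((maxnum, 1), dtype=np.float32)
onehot[3, 0] = 1.0
for _ in range(5):
    ae.trainonone(onehot)

embedding = ae.getoutput(onehot)
print(embedding.shape)  # (reduced_dims, 1)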
'''
Defines the GloVe model and performs one mini-batch SGD (AdaGrad) update
in Theano.
'''
from theano import tensor as T
import theano
import numpy as np


class glove(object):
    def __init__(self, vocab_size, dim, lr=0.05):
        W = np.asarray(np.random.rand(vocab_size, dim),
                       dtype=theano.config.floatX) / float(dim)
        W1 = np.asarray(np.random.rand(vocab_size, dim),
                        dtype=theano.config.floatX) / float(dim)
        self.W = theano.shared(W, name='W', borrow=True)
        self.W1 = theano.shared(W1, name='W1', borrow=True)
        # AdaGrad accumulators, initialised to ones.
        gW = np.asarray(np.ones((vocab_size, dim)), dtype=theano.config.floatX)
        gW1 = np.asarray(np.ones((vocab_size, dim)), dtype=theano.config.floatX)
        self.gW = theano.shared(gW, name='gW', borrow=True)
        self.gW1 = theano.shared(gW1, name='gW1', borrow=True)
        X = T.vector()        # log co-occurrence counts for the batch
        fX = T.vector()       # weighting-function values for the batch
        ind_W = T.ivector()   # target-word (row) indices
        ind_W1 = T.ivector()  # context-word (column) indices
        w = self.W[ind_W, :]
        w1 = self.W1[ind_W1, :]
        cost = T.sum(fX * ((T.sum(w * w1, axis=1) - X) ** 2))
        # Clip each gradient separately; T.clip applied to the Python list
        # returned by T.grad is not well defined.
        grads = T.grad(cost, [w, w1])
        grad = [T.clip(g, -5.0, 5.0) for g in grads]
        updates1 = [(self.gW, T.inc_subtensor(self.gW[ind_W, :],
                                              grad[0] ** 2))]
        updates2 = [(self.gW1, T.inc_subtensor(self.gW1[ind_W1, :],
                                               grad[1] ** 2))]
        updates3 = [(self.W, T.inc_subtensor(self.W[ind_W, :],
                                             -(lr / T.sqrt(self.gW[ind_W, :])) *
                                             grad[0]))]
        updates4 = [(self.W1, T.inc_subtensor(self.W1[ind_W1, :],
                                              -(lr / T.sqrt(self.gW1[ind_W1, :])) *
                                              grad[1]))]
        updates = updates1 + updates2 + updates3 + updates4
        self.cost_fn = theano.function(inputs=[ind_W, ind_W1, X, fX],
                                       outputs=cost, updates=updates)

    def sgd(self, indw, indw1, X, fX):
        '''
        Performs one iteration of SGD.
        '''
        return self.cost_fn(indw, indw1, X, fX)

    def save_params(self):
        '''
        Saves the word embedding lookup matrix (W + W1) to file.
        '''
        W = self.W.get_value() + self.W1.get_value()
        np.save('lookup', W)
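
The cost above is the weighted least-squares objective of GloVe (Pennington et al., 2014), minus the per-word bias terms the paper also includes:

J = \sum_{(i,j)\,:\,X_{ij} \neq 0} f(X_{ij}) \left( w_i^\top \tilde{w}_j - \log X_{ij} \right)^2,
\qquad
f(x) = \begin{cases} (x / x_{\max})^{0.75} & \text{if } x < x_{\max} \\ 1 & \text{otherwise} \end{cases}

In the code, the input X already holds \log X_{ij} for the batch and fX holds the precomputed weights f(X_{ij}); the four updates implement AdaGrad, i.e. per-element learning rates scaled by the accumulated squared gradients in gW and gW1.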
# Training-loop fragment (evidently a method body from a larger driver
# class): assumes `coocur.coocur_mat` is a scipy.sparse co-occurrence
# matrix, and that vocab_size, self.dim, self.n_epochs and
# self.minibatch_size are defined in the enclosing scope.
nnz = coocur.coocur_mat.nonzero()
model = glove(vocab_size, self.dim)
# nnz has the (i, j) indices of the non-zero entries
nz = np.zeros((nnz[0].shape[0], 2))
nz[:, 0] = nnz[0]
nz[:, 1] = nnz[1]
np.random.shuffle(nz)
print("Starting training, brace yourself")
for epoch in range(self.n_epochs):
    for i in range(0, nnz[0].shape[0], self.minibatch_size):
        indw = np.asarray(nz[i:(i + self.minibatch_size), 0], dtype=np.int32)
        indw1 = np.asarray(nz[i:(i + self.minibatch_size), 1], dtype=np.int32)
        batch_size = indw.shape[0]
        X = np.asarray(coocur.coocur_mat[indw, indw1].todense(),
                       dtype=theano.config.floatX).reshape(batch_size,)
        # GloVe weighting f(x) = (x / x_max)^0.75 for x < x_max, else 1,
        # with x_max = 100 (the original had the two branches swapped).
        fX = np.zeros_like(X)
        for j in range(0, X.shape[0]):
            if X[j] < 100:
                fX[j] = (X[j] / 100.) ** 0.75
            else:
                fX[j] = 1.
        X = np.log(X)
        cost = model.sgd(indw, indw1, X, fX)
    print("Cost in epoch %d is %f" % (epoch, cost))
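
To make the fragment runnable end to end, here is a hypothetical driver; the scipy.sparse co-occurrence matrix, the vocabulary size, and all hyper-parameters are illustrative assumptions, not part of the original gist.

import numpy as np
import scipy.sparse as sp
import theano

# Hypothetical toy co-occurrence matrix for a 100-word vocabulary.
vocab_size, dim = 100, 20
counts = sp.random(vocab_size, vocab_size, density=0.05, format='csr') * 50
counts.data += 1.0  # keep every stored count strictly positive for the log

model = glove(vocab_size, dim, lr=0.05)

rows, cols = counts.nonzero()
order = np.random.permutation(rows.shape[0])
batch = 32
for epoch in range(3):
    for i in range(0, order.shape[0], batch):
        idx = order[i:i + batch]
        indw = rows[idx].astype(np.int32)
        indw1 = cols[idx].astype(np.int32)
        X = np.asarray(counts[indw, indw1]).reshape(indw.shape[0],)
        fX = np.where(X < 100, (X / 100.) ** 0.75, 1.).astype(theano.config.floatX)
        cost = model.sgd(indw, indw1,
                         np.log(X).astype(theano.config.floatX), fX)
    print("epoch %d, last batch cost %f" % (epoch, cost))

model.save_params()  # writes lookup.npy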
  