Word embedding models (Theano-only code, for reference)
  
        
  
    
    
  
  
    
import numpy as np
import theano
from theano import tensor as T

rng = np.random


class Autoencoder(object):
    def __init__(self, maxnum, reduced_dims, learnrate=0.4):
        self.threshold = 1e-2
        # Input variable (equivalent to dummyword in the original
        # implementation): a (maxnum, 1) column vector, e.g. a one-hot
        # encoding of a word.
        self.inputs = theano.shared(np.zeros((maxnum, 1), dtype=np.float32))
        self.W1 = theano.shared((rng.randn(reduced_dims, maxnum) * 0.1)
                                .astype(theano.config.floatX), name='W1')
        self.W2 = theano.shared((rng.randn(maxnum, reduced_dims) * 0.1)
                                .astype(theano.config.floatX), name='W2')
        self.output = T.dot(self.W1, self.inputs)
        self.recons = T.dot(self.W2, self.output)
        self.totloss = T.sum((self.inputs - self.recons) ** 2)
        # Gradients w.r.t. parameters, clipped to (-threshold, threshold)
        self.W1_grad = T.clip(T.grad(self.totloss, self.W1),
                              -1 * self.threshold, self.threshold)
        self.W2_grad = T.clip(T.grad(self.totloss, self.W2),
                              -1 * self.threshold, self.threshold)
        self.updates = [(self.W1, self.W1 - learnrate * self.W1_grad),
                        (self.W2, self.W2 - learnrate * self.W2_grad)]
        self.train = theano.function([], self.totloss, updates=self.updates,
                                     allow_input_downcast=True)

    def trainonone(self, wordvec):
        # One gradient step on a single (maxnum, 1) input vector.
        wordvec = np.asarray(wordvec, dtype=np.float32)
        self.inputs.set_value(wordvec)
        self.loss = self.train()
        print("Loss incurred:", self.loss)

    def getoutput(self, wordvec):
        # Returns the embedding for a word (the hidden activation W1 * input).
        self.inputs.set_value(np.asarray(wordvec, dtype=np.float32))
        genembedding = self.output.eval()
        return genembedding
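
For reference, a minimal usage sketch for the Autoencoder above (not part of the original gist): it trains on a one-hot column vector and reads back the learned embedding. The vocabulary size, embedding width, and number of passes are arbitrary illustrative choices.

import numpy as np

# Hypothetical toy run: 50-word vocabulary, 10-dimensional embeddings.
maxnum, reduced_dims = 50, 10
ae = Autoencoder(maxnum, reduced_dims)

# Train a few steps on the one-hot vector for word index 3.
onehot = np.zeros((maxnum, 1), dtype=np.float32)
onehot[3, 0] = 1.0
for _ in range(5):
    ae.trainonone(onehot)

embedding = ae.getoutput(onehot)
print(embedding.shape)  # (reduced_dims, 1)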
'''
Defines the GloVe model and performs one mini-batch SGD (AdaGrad) update
in Theano.
'''
from theano import tensor as T
import theano
import numpy as np


class glove(object):
    def __init__(self, vocab_size, dim, lr=0.05):
        W = np.asarray(np.random.rand(vocab_size, dim),
                       dtype=theano.config.floatX) / float(dim)
        W1 = np.asarray(np.random.rand(vocab_size, dim),
                        dtype=theano.config.floatX) / float(dim)
        self.W = theano.shared(W, name='W', borrow=True)
        self.W1 = theano.shared(W1, name='W1', borrow=True)
        # AdaGrad accumulators, initialised to ones.
        gW = np.asarray(np.ones((vocab_size, dim)), dtype=theano.config.floatX)
        gW1 = np.asarray(np.ones((vocab_size, dim)), dtype=theano.config.floatX)
        self.gW = theano.shared(gW, name='gW', borrow=True)
        self.gW1 = theano.shared(gW1, name='gW1', borrow=True)
        X = T.vector()        # log co-occurrence counts for the batch
        fX = T.vector()       # weighting-function values for the batch
        ind_W = T.ivector()   # target-word (row) indices
        ind_W1 = T.ivector()  # context-word (column) indices
        w = self.W[ind_W, :]
        w1 = self.W1[ind_W1, :]
        cost = T.sum(fX * ((T.sum(w * w1, axis=1) - X) ** 2))
        # Clip each gradient separately; T.clip applied to the Python list
        # returned by T.grad is not well defined.
        grads = T.grad(cost, [w, w1])
        grad = [T.clip(g, -5.0, 5.0) for g in grads]
        updates1 = [(self.gW, T.inc_subtensor(self.gW[ind_W, :],
                                              grad[0] ** 2))]
        updates2 = [(self.gW1, T.inc_subtensor(self.gW1[ind_W1, :],
                                               grad[1] ** 2))]
        updates3 = [(self.W, T.inc_subtensor(self.W[ind_W, :],
                                             -(lr / T.sqrt(self.gW[ind_W, :])) *
                                             grad[0]))]
        updates4 = [(self.W1, T.inc_subtensor(self.W1[ind_W1, :],
                                              -(lr / T.sqrt(self.gW1[ind_W1, :])) *
                                              grad[1]))]
        updates = updates1 + updates2 + updates3 + updates4
        self.cost_fn = theano.function(inputs=[ind_W, ind_W1, X, fX],
                                       outputs=cost, updates=updates)

    def sgd(self, indw, indw1, X, fX):
        '''
        Performs one iteration of SGD.
        '''
        return self.cost_fn(indw, indw1, X, fX)

    def save_params(self):
        '''
        Saves the word embedding lookup matrix (W + W1) to file.
        '''
        W = self.W.get_value() + self.W1.get_value()
        np.save('lookup', W)
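
The cost above is the weighted least-squares objective of GloVe (Pennington et al., 2014), minus the per-word bias terms the paper also includes:

J = \sum_{(i,j)\,:\,X_{ij} \neq 0} f(X_{ij}) \left( w_i^\top \tilde{w}_j - \log X_{ij} \right)^2,
\qquad
f(x) = \begin{cases} (x / x_{\max})^{0.75} & \text{if } x < x_{\max} \\ 1 & \text{otherwise} \end{cases}

In the code, the input X already holds \log X_{ij} for the batch and fX holds the precomputed weights f(X_{ij}); the four updates implement AdaGrad, i.e. per-element learning rates scaled by the accumulated squared gradients in gW and gW1.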
# Training-loop fragment (evidently a method body from a larger driver
# class): assumes `coocur.coocur_mat` is a scipy.sparse co-occurrence
# matrix, and that vocab_size, self.dim, self.n_epochs and
# self.minibatch_size are defined in the enclosing scope.
nnz = coocur.coocur_mat.nonzero()
model = glove(vocab_size, self.dim)
# nnz has the (i, j) indices of the non-zero entries
nz = np.zeros((nnz[0].shape[0], 2))
nz[:, 0] = nnz[0]
nz[:, 1] = nnz[1]
np.random.shuffle(nz)
print("Starting training, brace yourself")
for epoch in range(self.n_epochs):
    for i in range(0, nnz[0].shape[0], self.minibatch_size):
        indw = np.asarray(nz[i:(i + self.minibatch_size), 0], dtype=np.int32)
        indw1 = np.asarray(nz[i:(i + self.minibatch_size), 1], dtype=np.int32)
        batch_size = indw.shape[0]
        X = np.asarray(coocur.coocur_mat[indw, indw1].todense(),
                       dtype=theano.config.floatX).reshape(batch_size,)
        # GloVe weighting f(x) = (x / x_max)^0.75 for x < x_max, else 1,
        # with x_max = 100 (the original had the two branches swapped).
        fX = np.zeros_like(X)
        for j in range(0, X.shape[0]):
            if X[j] < 100:
                fX[j] = (X[j] / 100.) ** 0.75
            else:
                fX[j] = 1.
        X = np.log(X)
        cost = model.sgd(indw, indw1, X, fX)
    print("Cost in epoch %d is %f" % (epoch, cost))
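
To make the fragment runnable end to end, here is a hypothetical driver; the scipy.sparse co-occurrence matrix, the vocabulary size, and all hyper-parameters are illustrative assumptions, not part of the original gist.

import numpy as np
import scipy.sparse as sp
import theano

# Hypothetical toy co-occurrence matrix for a 100-word vocabulary.
vocab_size, dim = 100, 20
counts = sp.random(vocab_size, vocab_size, density=0.05, format='csr') * 50
counts.data += 1.0  # keep every stored count strictly positive for the log

model = glove(vocab_size, dim, lr=0.05)

rows, cols = counts.nonzero()
order = np.random.permutation(rows.shape[0])
batch = 32
for epoch in range(3):
    for i in range(0, order.shape[0], batch):
        idx = order[i:i + batch]
        indw = rows[idx].astype(np.int32)
        indw1 = cols[idx].astype(np.int32)
        X = np.asarray(counts[indw, indw1]).reshape(indw.shape[0],)
        fX = np.where(X < 100, (X / 100.) ** 0.75, 1.).astype(theano.config.floatX)
        cost = model.sgd(indw, indw1,
                         np.log(X).astype(theano.config.floatX), fX)
    print("epoch %d, last batch cost %f" % (epoch, cost))

model.save_params()  # writes lookup.npy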
  