JonathanRaiman · August 29, 2015 14:07
diff --git a/Stanford GloVe b/Stanford GloVe
 # coding: utf-8
 #
 # @author Jonathan Raiman
 # @date 9th October 2014
 #
 # Experiments with Stanford's GloVe word vectors.
 # Download them [here](http://www-nlp.stanford.edu/projects/glove/)

 import gzip, numpy as np, io

 class GloveModel:
     """Unit-normalised GloVe word vectors with simple sentence-level helpers.

     The model file is a gzip-compressed text file with one entry per line:
     the word, a single space, then the space-separated vector components.

     Attributes set by ``load_model``:
         index2word: list of words, in file order.
         word2index: dict mapping each word to its row in ``model_matrix``.
         model_matrix: 2-D float32 array holding one L2-normalised row per word.
     """

     def __init__(self, path):
         """Remember ``path`` and immediately load the vectors stored there."""
         self.path = path
         self.load_model()

     def load_model(self):
         """Read ``self.path`` and fill the vocabulary and the vector matrix.

         Lines without any vector components (e.g. blank lines) are skipped.

         Raises:
             ValueError: if the file holds no vectors, or if the vectors do not
                 all have the same length (raised by ``np.vstack``).
         """
         vecs = []
         self.index2word = []
         self.word2index = {}
         # Decode explicitly as UTF-8 instead of relying on the platform's
         # default text encoding.
         with gzip.open(self.path, "rt", encoding="utf-8") as f:
             for line in f:
                 # Only the first space separates the word from its vector.
                 word, _, rest = line.partition(" ")
                 values = rest.split()
                 if not values:
                     continue
                 vec = np.array([float(v) for v in values], dtype=np.float32)
                 norm = np.linalg.norm(vec)
                 # An all-zero vector cannot be normalised; keep it as zeros
                 # rather than turning it into NaNs.
                 if norm > 0:
                     vec = vec / norm
                 # Use the running vocabulary size (not the line number) so
                 # skipped lines never desynchronise word2index and the matrix.
                 self.word2index[word] = len(self.index2word)
                 self.index2word.append(word)
                 vecs.append(vec)
         if not vecs:
             raise ValueError("no word vectors found in %r" % (self.path,))
         self.model_matrix = np.vstack(vecs)

     def sentence_to_vec(self, sentence):
         """Return the normalised average vector of the known words in ``sentence``.

         The sentence is split on whitespace; words missing from the vocabulary
         are ignored. If no word is known, a zero vector is returned.
         """
         indices = [
             self.word2index[word]
             for word in sentence.split()
             if word in self.word2index
         ]
         if not indices:
             return np.zeros(self.model_matrix.shape[1], dtype=self.model_matrix.dtype)
         # take average and normalize
         mean = self.model_matrix[indices].mean(axis=0)
         norm = np.linalg.norm(mean)
         # Vectors that cancel out exactly have no direction; return the zeros.
         return mean / norm if norm > 0 else mean

     def sentence_similarity(self, sentence_a, sentence_b):
         """Return the cosine similarity between the two sentences' vectors.

         The result is 0 when either sentence contains no known word.
         """
         return np.dot(self.sentence_to_vec(sentence_a), self.sentence_to_vec(sentence_b))

     def most_similar_words(self, sentence, topn=10):
         """Return the ``topn`` vocabulary words closest to ``sentence``.

         Returns:
             A list of ``(word, index, similarity)`` tuples, most similar first,
             containing at most ``topn`` entries.
         """
         vector = self.sentence_to_vec(sentence)
         # Rows are unit length, so the dot product is the cosine similarity.
         dists = np.dot(self.model_matrix, vector)
         best = np.argsort(dists)[::-1][:max(topn, 0)]
         return [(self.index2word[k], k, dists[k]) for k in best]

 # Load the vectors from a local gzip archive (the path is machine-specific).
 model = GloveModel("/Users/jonathanraiman/Desktop/glove_words.gz")

 # Print the words ranked most similar to a whole sentence.
 nearest_words = model.most_similar_words("take me to a chinese restaurant in 15 minutes")
 print(nearest_words)

 # Print the similarity between two single-token sentences.
 similarity = model.sentence_similarity("15", "100")
 print(similarity)
	# coding: utf-8
	#
	# @author Jonathan Raiman
	# @date 9th October 2014
	#
	# Experiments with Stanford's GloVe word vectors.
	# Download them [here](http://www-nlp.stanford.edu/projects/glove/)

	import gzip, numpy as np, io

	class GloveModel:
	    """Unit-normalised GloVe word vectors with simple sentence-level helpers.

	    The model file is a gzip-compressed text file with one entry per line:
	    the word, a single space, then the space-separated vector components.

	    Attributes set by ``load_model``:
	        index2word: list of words, in file order.
	        word2index: dict mapping each word to its row in ``model_matrix``.
	        model_matrix: 2-D float32 array holding one L2-normalised row per word.
	    """

	    def __init__(self, path):
	        """Remember ``path`` and immediately load the vectors stored there."""
	        self.path = path
	        self.load_model()

	    def load_model(self):
	        """Read ``self.path`` and fill the vocabulary and the vector matrix.

	        Lines without any vector components (e.g. blank lines) are skipped.

	        Raises:
	            ValueError: if the file holds no vectors, or if the vectors do not
	                all have the same length (raised by ``np.vstack``).
	        """
	        vecs = []
	        self.index2word = []
	        self.word2index = {}
	        # Decode explicitly as UTF-8 instead of relying on the platform's
	        # default text encoding.
	        with gzip.open(self.path, "rt", encoding="utf-8") as f:
	            for line in f:
	                # Only the first space separates the word from its vector.
	                word, _, rest = line.partition(" ")
	                values = rest.split()
	                if not values:
	                    continue
	                vec = np.array([float(v) for v in values], dtype=np.float32)
	                norm = np.linalg.norm(vec)
	                # An all-zero vector cannot be normalised; keep it as zeros
	                # rather than turning it into NaNs.
	                if norm > 0:
	                    vec = vec / norm
	                # Use the running vocabulary size (not the line number) so
	                # skipped lines never desynchronise word2index and the matrix.
	                self.word2index[word] = len(self.index2word)
	                self.index2word.append(word)
	                vecs.append(vec)
	        if not vecs:
	            raise ValueError("no word vectors found in %r" % (self.path,))
	        self.model_matrix = np.vstack(vecs)

	    def sentence_to_vec(self, sentence):
	        """Return the normalised average vector of the known words in ``sentence``.

	        The sentence is split on whitespace; words missing from the vocabulary
	        are ignored. If no word is known, a zero vector is returned.
	        """
	        indices = [
	            self.word2index[word]
	            for word in sentence.split()
	            if word in self.word2index
	        ]
	        if not indices:
	            return np.zeros(self.model_matrix.shape[1], dtype=self.model_matrix.dtype)
	        # take average and normalize
	        mean = self.model_matrix[indices].mean(axis=0)
	        norm = np.linalg.norm(mean)
	        # Vectors that cancel out exactly have no direction; return the zeros.
	        return mean / norm if norm > 0 else mean

	    def sentence_similarity(self, sentence_a, sentence_b):
	        """Return the cosine similarity between the two sentences' vectors.

	        The result is 0 when either sentence contains no known word.
	        """
	        return np.dot(self.sentence_to_vec(sentence_a), self.sentence_to_vec(sentence_b))

	    def most_similar_words(self, sentence, topn=10):
	        """Return the ``topn`` vocabulary words closest to ``sentence``.

	        Returns:
	            A list of ``(word, index, similarity)`` tuples, most similar first,
	            containing at most ``topn`` entries.
	        """
	        vector = self.sentence_to_vec(sentence)
	        # Rows are unit length, so the dot product is the cosine similarity.
	        dists = np.dot(self.model_matrix, vector)
	        best = np.argsort(dists)[::-1][:max(topn, 0)]
	        return [(self.index2word[k], k, dists[k]) for k in best]

	# Load the vectors from a local gzip archive (the path is machine-specific).
	model = GloveModel("/Users/jonathanraiman/Desktop/glove_words.gz")

	# Print the words ranked most similar to a whole sentence.
	nearest_words = model.most_similar_words("take me to a chinese restaurant in 15 minutes")
	print(nearest_words)

	# Print the similarity between two single-token sentences.
	similarity = model.sentence_similarity("15", "100")
	print(similarity)