Last active
August 29, 2015 14:07
-
-
Save JonathanRaiman/0d45d1ab214119cf45eb to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8
#
# @author Jonathan Raiman
# @date 9th October 2014
#
# Messing around with Stanford's GloVe word vectors.
# Download them [here](http://www-nlp.stanford.edu/projects/glove/)
import gzip
import io

import numpy as np
class GloveModel:
    """Word vectors read from a gzip-compressed text file, one word per line.

    Each line is expected to hold a word, a single space, and then the
    whitespace-separated components of that word's vector.  Every vector is
    scaled to unit L2 norm at load time, so a dot product between two rows
    of ``model_matrix`` is their cosine similarity.

    Attributes set by ``load_model``:
        path: location of the gzip file given to the constructor.
        index2word: list of words, in file order.
        word2index: dict mapping each word to its row in ``model_matrix``.
        model_matrix: float32 array with one normalised row per word.
    """

    def __init__(self, path):
        """Remember ``path`` and immediately load the vectors from it."""
        self.path = path
        self.load_model()

    def load_model(self):
        """Parse the file at ``self.path`` into the lookup tables and matrix.

        Raises:
            ValueError: if the file contains no usable ``word values...``
                line, or if a line's values cannot be parsed as floats.
        """
        index2word = []
        word2index = {}
        rows = []
        # GloVe releases are UTF-8; do not depend on the platform default.
        with gzip.open(self.path, "rt", encoding="utf-8") as f:
            for line in f:
                word, _, values = line.partition(" ")
                components = values.split()
                if not components:
                    # Blank line or a word with no vector: nothing to store.
                    continue
                vec = np.asarray(components, dtype=np.float32)
                norm = np.linalg.norm(vec)
                if norm > 0:
                    # An all-zero vector is kept as is instead of becoming NaN.
                    vec = vec / norm
                # Index by the number of rows kept so far so that skipped
                # lines never desynchronise words and matrix rows.
                word2index[word] = len(index2word)
                index2word.append(word)
                rows.append(vec)
        if not rows:
            raise ValueError("no word vectors found in %r" % (self.path,))
        self.index2word = index2word
        self.word2index = word2index
        self.model_matrix = np.vstack(rows)

    def sentence_to_vec(self, sentence):
        """Return the unit-length average vector of the known words in ``sentence``.

        The sentence is split on whitespace and words missing from the
        vocabulary are ignored.  A zero vector is returned when no word is
        known or when the known vectors cancel each other out exactly.
        """
        known = [self.word2index[word]
                 for word in sentence.split()
                 if word in self.word2index]
        if not known:
            return np.zeros(self.model_matrix.shape[1],
                            dtype=self.model_matrix.dtype)
        mean = self.model_matrix[known].mean(axis=0)
        norm = np.linalg.norm(mean)
        if norm == 0:
            # Avoid 0/0 -> NaN; the mean is already the zero vector.
            return mean
        return mean / norm

    def sentence_similarity(self, sentence_a, sentence_b):
        """Return the dot product of the two sentences' averaged vectors.

        Because ``sentence_to_vec`` returns unit (or zero) vectors this is
        the cosine similarity, or 0 when either sentence has no known word.
        """
        return np.dot(self.sentence_to_vec(sentence_a),
                      self.sentence_to_vec(sentence_b))

    def most_similar_words(self, sentence, topn=10):
        """Return the ``topn`` vocabulary words closest to ``sentence``.

        Returns:
            A list of ``(word, row_index, similarity)`` tuples sorted by
            decreasing similarity.  Fewer than ``topn`` entries are returned
            when the vocabulary is smaller than ``topn``.
        """
        vector = self.sentence_to_vec(sentence)
        dists = np.dot(self.model_matrix, vector)
        best = np.argsort(dists)[::-1][:topn]
        return [(self.index2word[k], k, dists[k]) for k in best]
# Demo: load the vectors from a local gzip file, then print the nearest
# vocabulary words for a sample sentence and the similarity of two tokens.
model = GloveModel("/Users/jonathanraiman/Desktop/glove_words.gz")
nearest_words = model.most_similar_words("take me to a chinese restaurant in 15 minutes")
print(nearest_words)
number_similarity = model.sentence_similarity("15", "100")
print(number_similarity)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment