Skip to content

Instantly share code, notes, and snippets.

@viksit
Created November 22, 2015 23:55
Show Gist options
  • Save viksit/1e76b18b9e471eabd042 to your computer and use it in GitHub Desktop.
Save viksit/1e76b18b9e471eabd042 to your computer and use it in GitHub Desktop.
Python code to load a pre-trained set of GloVe vectors and run a similarity query against them
#!/usr/bin/python
# load glove
import codecs
import array
import collections
import io
try:
# Python 2 compat
import cPickle as pickle
except ImportError:
import pickle
import numpy as np
import scipy.sparse as sp
import plac
# Loading glove model from stanford
def load_stanford(filename):
    """
    Load model from the output files generated by
    the C code from http://nlp.stanford.edu/projects/glove/.

    Each usable line holds a word followed by its vector components,
    separated by single spaces.  Lines without any components (for
    example blank lines) are skipped.

    The entries of the word dictionary will be of type
    unicode in Python 2 and str in Python 3.

    Args:
        filename: Path of the text file to read.  It is decoded as
            ISO-8859-1, so every byte maps to exactly one character.

    Returns:
        A tuple ``(word_vecs, dct, inverse_dictionary)`` where
        ``word_vecs`` is a 2-D numpy array with one row per vector read,
        ``dct`` maps each word to its row index and
        ``inverse_dictionary`` maps row indices back to words.  If a
        word occurs more than once, ``dct`` points at its last row and
        the earlier rows have no entry in ``inverse_dictionary``.

    Raises:
        ValueError: If the file contains no vectors, if a component is
            not a valid float, or if rows have differing lengths.
    """
    dct = {}
    vectors = array.array('d')
    no_components = None
    no_vectors = 0
    # Read in the data.  io.open only breaks lines on real newlines;
    # codecs.open also breaks on characters such as U+0085, which under
    # ISO-8859-1 is any 0x85 byte and would cut a word in half.
    with io.open(filename, 'r', encoding='ISO-8859-1') as savefile:
        for line_no, line in enumerate(savefile, 1):
            tokens = line.rstrip('\r\n').split(' ')
            word = tokens[0]
            # Drop empty tokens produced by trailing or doubled spaces.
            entries = [tok for tok in tokens[1:] if tok]
            if not entries:
                continue
            # Infer word vectors dimensions from the first usable row and
            # insist that every later row agrees with it.
            if no_components is None:
                no_components = len(entries)
            elif len(entries) != no_components:
                raise ValueError(
                    "%s line %d: expected %d components, found %d"
                    % (filename, line_no, no_components, len(entries)))
            # Index by rows actually stored so ids stay aligned with the
            # matrix even when lines are skipped or words repeat.
            dct[word] = no_vectors
            vectors.extend(float(x) for x in entries)
            no_vectors += 1
    if no_vectors == 0:
        raise ValueError("no word vectors found in %s" % filename)
    # Single-argument print calls give the same output on Python 2 and 3.
    print("Corpus stats:  %d %d" % (no_components, no_vectors))
    # Make these into numpy arrays
    word_vecs = np.array(vectors).reshape(no_vectors, no_components)
    print(word_vecs.shape)
    print(word_vecs[:5])
    inverse_dictionary = {v: k for k, v in dct.items()}
    return (word_vecs, dct, inverse_dictionary)
def similarity_query(word_vectors, word_vec, number, inverse_dictionary):
    """
    Return the words whose vectors are most cosine-similar to word_vec.

    Args:
        word_vectors: 2-D numpy array with one word vector per row.
        word_vec: 1-D query vector with as many entries as a row.
        number: Maximum number of results to return; zero or a
            negative value yields an empty list.
        inverse_dictionary: Mapping from row index to word.  Rows
            without an entry are skipped.

    Returns:
        A list of ``(word, similarity)`` tuples ordered from most to
        least similar, holding at most ``number`` items.

    Zero-length vectors are not special-cased: dividing by their norm
    yields NaN similarities.
    """
    dst = (np.dot(word_vectors, word_vec)
           / np.linalg.norm(word_vectors, axis=1)
           / np.linalg.norm(word_vec))
    word_ids = np.argsort(-dst)
    # Skip unmapped rows *before* counting towards ``number`` so that
    # they do not silently shorten the result list.
    results = []
    for x in word_ids:
        if len(results) >= number:
            break
        if x in inverse_dictionary:
            results.append((inverse_dictionary[x], dst[x]))
    return results
def main(filename, word):
    """
    Print the ten entries most similar to ``word``.

    Args:
        filename: Path of the vectors file passed to load_stanford.
        word: Word whose vector is used as the query.  The query word
            itself is part of the vocabulary, so it appears in the
            printed results.

    Raises:
        KeyError: If ``word`` is not present in the loaded vocabulary.
    """
    word_vecs, dct, inverse_dct = load_stanford(filename)
    if word not in dct:
        # Name the missing word and file instead of a bare KeyError.
        raise KeyError("word %r not found in %s" % (word, filename))
    word_idx = dct[word]
    # Single-argument print call works on both Python 2 and Python 3.
    print(similarity_query(word_vecs, word_vecs[word_idx], 10, inverse_dct))
# Script entry point: when run directly (not imported), hand control to
# plac, which calls main with the command-line arguments.
if __name__ == "__main__":
    plac.call(main)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment