Python code to load a pre-trained set of GloVe vectors and run a similarity query against them.
#!/usr/bin/python
# Load and query pre-trained GloVe word vectors.
from __future__ import print_function

import codecs
import array
import collections
import io
try:
    # Python 2 compat
    import cPickle as pickle
except ImportError:
    import pickle
import numpy as np
import scipy.sparse as sp
import plac


# Loading the GloVe model released by Stanford.
def load_stanford(filename):
    """
    Load model from the output files generated by
    the C code from http://nlp.stanford.edu/projects/glove/.

    The entries of the word dictionary will be of type
    unicode in Python 2 and str in Python 3.
    """
    dct = {}
    vectors = array.array('d')

    # Read in the data: one word per line, followed by its vector components.
    with codecs.open(filename, 'r', encoding='ISO-8859-1') as savefile:
        for i, line in enumerate(savefile):
            tokens = line.split(' ')
            word = tokens[0]
            entries = tokens[1:]
            dct[word] = i
            vectors.extend(float(x) for x in entries)

    # Infer word vector dimensions from the last line read.
    no_components = len(entries)
    no_vectors = len(dct)
    print("Corpus stats: ", no_components, no_vectors)

    # Make these into numpy arrays.
    word_vecs = np.array(vectors).reshape(no_vectors, no_components)
    print(word_vecs.shape)
    print(word_vecs[:5])

    inverse_dictionary = {v: k for k, v in dct.items()}
    return (word_vecs, dct, inverse_dictionary)


def similarity_query(word_vectors, word_vec, number, inverse_dictionary):
    # Cosine similarity between the query vector and every word vector.
    dst = (np.dot(word_vectors, word_vec)
           / np.linalg.norm(word_vectors, axis=1)
           / np.linalg.norm(word_vec))
    # Sort word indices by descending similarity and keep the top `number`.
    word_ids = np.argsort(-dst)
    return [(inverse_dictionary[x], dst[x]) for x in word_ids[:number]
            if x in inverse_dictionary]


def main(filename, word):
    word_vecs, dct, inverse_dct = load_stanford(filename)
    word_idx = dct[word]
    print(similarity_query(word_vecs, word_vecs[word_idx], 10, inverse_dct))


if __name__ == "__main__":
    plac.call(main)
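A minimal usage sketch. It assumes the script has been saved as `glove_similarity.py` and that a GloVe vector file such as `glove.6B.50d.txt` (from the Stanford download) sits in the working directory; the file name and the query word are illustrative, not part of the gist.

```python
# Hypothetical usage: the module name, vector file, and query word below
# are example choices, not part of the original gist.
#
# From the command line, plac maps the positional arguments onto main():
#   python glove_similarity.py glove.6B.50d.txt king
#
# Or call the functions directly from Python:
from glove_similarity import load_stanford, similarity_query

word_vecs, dct, inverse_dct = load_stanford("glove.6B.50d.txt")
neighbours = similarity_query(word_vecs, word_vecs[dct["king"]], 10, inverse_dct)
for word, score in neighbours:
    print(word, score)
```

Because the `plac.call(main)` line is guarded by `if __name__ == "__main__":`, importing the module this way does not trigger the command-line entry point.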