Python code to load a pre-trained set of GloVe vectors and run a similarity query against them.
#!/usr/bin/python
# Load and query pre-trained GloVe word vectors.
from __future__ import print_function

import codecs
import array
import collections
import io
try:
    # Python 2 compat
    import cPickle as pickle
except ImportError:
    import pickle
import numpy as np
import scipy.sparse as sp
import plac


# Loading the GloVe model released by Stanford.
def load_stanford(filename):
    """
    Load model from the output files generated by
    the C code from http://nlp.stanford.edu/projects/glove/.

    The entries of the word dictionary will be of type
    unicode in Python 2 and str in Python 3.
    """
    dct = {}
    vectors = array.array('d')

    # Read in the data: one word per line, followed by its vector components.
    with codecs.open(filename, 'r', encoding='ISO-8859-1') as savefile:
        for i, line in enumerate(savefile):
            tokens = line.split(' ')
            word = tokens[0]
            entries = tokens[1:]
            dct[word] = i
            vectors.extend(float(x) for x in entries)

    # Infer word vector dimensions from the last line read.
    no_components = len(entries)
    no_vectors = len(dct)
    print("Corpus stats: ", no_components, no_vectors)

    # Make these into numpy arrays.
    word_vecs = np.array(vectors).reshape(no_vectors, no_components)
    print(word_vecs.shape)
    print(word_vecs[:5])

    inverse_dictionary = {v: k for k, v in dct.items()}
    return (word_vecs, dct, inverse_dictionary)


def similarity_query(word_vectors, word_vec, number, inverse_dictionary):
    # Cosine similarity between the query vector and every word vector.
    dst = (np.dot(word_vectors, word_vec)
           / np.linalg.norm(word_vectors, axis=1)
           / np.linalg.norm(word_vec))
    # Sort word indices by descending similarity and keep the top `number`.
    word_ids = np.argsort(-dst)
    return [(inverse_dictionary[x], dst[x]) for x in word_ids[:number]
            if x in inverse_dictionary]


def main(filename, word):
    word_vecs, dct, inverse_dct = load_stanford(filename)
    word_idx = dct[word]
    print(similarity_query(word_vecs, word_vecs[word_idx], 10, inverse_dct))


if __name__ == "__main__":
    plac.call(main)
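A minimal usage sketch. It assumes the script has been saved as `glove_similarity.py` and that a GloVe vector file such as `glove.6B.50d.txt` (from the Stanford download) sits in the working directory; the file name and the query word are illustrative, not part of the gist.

```python
# Hypothetical usage: the module name, vector file, and query word below
# are example choices, not part of the original gist.
#
# From the command line, plac maps the positional arguments onto main():
#   python glove_similarity.py glove.6B.50d.txt king
#
# Or call the functions directly from Python:
from glove_similarity import load_stanford, similarity_query

word_vecs, dct, inverse_dct = load_stanford("glove.6B.50d.txt")
neighbours = similarity_query(word_vecs, word_vecs[dct["king"]], 10, inverse_dct)
for word, score in neighbours:
    print(word, score)
```

Because the `plac.call(main)` line is guarded by `if __name__ == "__main__":`, importing the module this way does not trigger the command-line entry point.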