Skip to content

Instantly share code, notes, and snippets.

@kcarnold
Last active August 29, 2015 14:10
Show Gist options
  • Save kcarnold/68f62f6e180bd61170ca to your computer and use it in GitHub Desktop.
Save kcarnold/68f62f6e180bd61170ca to your computer and use it in GitHub Desktop.
Convert GloVe (http://www-nlp.stanford.edu/projects/glove/) pre-trained vectors to quick-lookup matrices
import numpy as np
import sys
def read_stored(num_dims, names_filename, data_filename):
    """Open a stored names file and its float32 vector matrix.

    Args:
        num_dims: Number of columns (vector dimensions) in the matrix file.
        names_filename: Path to a text file holding one term per line.
        data_filename: Path to the raw float32 matrix file.

    Returns:
        A ``(names, data)`` tuple: ``names`` is a ``pandas.Index`` of the
        whitespace-stripped lines of ``names_filename``; ``data`` is a
        read-only ``np.memmap`` of shape ``(len(names), num_dims)``.
    """
    # pandas is imported lazily so that the conversion path (``main``) does
    # not require it to be installed.
    import pandas as pd

    # Use a context manager so the names file handle is closed promptly
    # instead of being left to the garbage collector.
    with open(names_filename) as name_file:
        names = pd.Index([line.strip() for line in name_file])

    # The row count is not stored in the matrix file; it is taken from the
    # number of names read above.
    num_terms = len(names)
    data = np.memmap(data_filename, dtype=np.float32, mode='r', shape=(num_terms, num_dims))
    return names, data
def main(in_file, num_terms, num_dims, names_filename, data_filename):
    """Convert text vectors into a names file plus a float32 matrix file.

    Each non-blank input line is expected to look like
    ``<label> <v1> <v2> ... <vN>`` with single-space separators. The label is
    appended to ``names_filename`` (one per line) and the parsed vector is
    stored as the next row of the matrix in ``data_filename``.

    Args:
        in_file: Iterable of text lines (e.g. an open file or ``sys.stdin``).
        num_terms: Number of rows to allocate in the output matrix.
        num_dims: Number of values per vector (matrix columns).
        names_filename: Output path for the labels; overwritten.
        data_filename: Output path for the float32 matrix; created or
            overwritten as a memory-mapped file.

    Raises:
        ValueError: If a non-blank line has no space-separated vector part,
            or a vector's length does not match ``num_dims``.
        IndexError: If the input has more non-blank lines than ``num_terms``.
    """
    data = np.memmap(data_filename, dtype=np.float32, mode='w+', shape=(num_terms, num_dims))
    row = 0
    # The context manager guarantees the names file is closed even when a
    # malformed line raises part-way through the conversion.
    with open(names_filename, 'w') as name_file:
        for line in in_file:
            line = line.strip()
            if not line:
                # Blank lines (such as a trailing newline) carry no vector;
                # skip them without consuming a matrix row.
                continue
            label, vec_as_text = line.split(' ', 1)
            name_file.write(label + '\n')
            data[row] = np.fromstring(vec_as_text, dtype=np.float32, sep=' ')
            row += 1
    # Push the written rows to disk explicitly rather than relying on the
    # memmap being garbage-collected.
    data.flush()
if __name__ == '__main__':
    # Command-line entry point: reads vectors as text from stdin and takes
    # exactly four arguments: num_terms num_dims names_filename data_filename.
    try:
        num_terms, num_dims, names_filename, data_filename = sys.argv[1:]
        # Convert inside the guarded region so non-integer counts produce the
        # usage message instead of a raw traceback.
        num_terms, num_dims = int(num_terms), int(num_dims)
    except ValueError:
        # Raised for a wrong argument count (unpacking) or a non-integer
        # count. Catching only ValueError lets KeyboardInterrupt and
        # SystemExit propagate, which a bare ``except`` would swallow.
        sys.stderr.write("Usage:\ngzcat datafile | python glove_to_npy.py num_terms num_dims names_filename data_filename\n\n")
        # Non-zero status so shell pipelines can detect the usage error.
        sys.exit(2)
    main(sys.stdin, num_terms, num_dims, names_filename, data_filename)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment