Skip to content

Instantly share code, notes, and snippets.

@evanthebouncy
Created September 23, 2020 01:11
Show Gist options
  • Save evanthebouncy/132e101de4777d75c5b29c1739cc21bf to your computer and use it in GitHub Desktop.
Save evanthebouncy/132e101de4777d75c5b29c1739cc21bf to your computer and use it in GitHub Desktop.
get closeby words from w2v for small word set
import numpy as np
# Open whichever w2v file you want. The file is read as UTF-8 (instead of the
# platform default encoding) and closed as soon as its lines have been loaded.
with open("glove.6B/glove.6B.50d.txt", encoding="utf-8") as glove_file:
    fd = glove_file.readlines()
def to_numpy(lines):
    """Parse word-vector text lines into a word list and a vector matrix.

    Each non-blank line is expected to hold a word followed by its vector
    components, all separated by single spaces.

    Args:
        lines: iterable of text lines, one word vector per line.

    Returns:
        A ``(keys, matrix)`` pair: ``keys`` is the list of words in input
        order and ``matrix`` is the NumPy array built from the per-word
        component lists (N x d when every line has d components).

    Raises:
        ValueError: if a vector component cannot be parsed as a float.
    """
    keys = []
    ary = []
    for line in lines:
        # Blank lines (e.g. a trailing newline at the end of the file) would
        # otherwise add an empty vector and make the matrix ragged.
        if not line.strip():
            continue
        # Split once; dropping trailing whitespace keeps the line terminator
        # out of the fields.
        fields = line.rstrip().split(' ')
        keys.append(fields[0])
        ary.append([float(x) for x in fields[1:]])
    return keys, np.array(ary)
# Parse the loaded lines once at module level: `keys` is the list of words and
# `w2v` the matching vector matrix (row i belongs to keys[i]).
keys, w2v = to_numpy(fd)
def get_closest(word_idx, ignore_idxs):
    """Return the row index of the word closest to a given word in ``w2v``.

    Closeness is the squared Euclidean distance between rows of the
    module-level ``w2v`` matrix.

    Args:
        word_idx: row index of the query word (e.g. 1123).
        ignore_idxs: iterable of row indices that must not be returned.

    Returns:
        The row index of the nearest word that is neither the query word
        nor listed in ``ignore_idxs``. If every row is excluded,
        ``np.argmin`` falls back to index 0.
    """
    word_vec = w2v[word_idx]
    dists = np.sum((w2v - word_vec) ** 2, axis=1)
    # Mask the query word and the ignored words with +inf. A finite sentinel
    # (previously 999) can be smaller than a genuine squared distance, in
    # which case an excluded index would win the argmin.
    dists[word_idx] = np.inf
    dists[np.asarray(list(ignore_idxs), dtype=np.intp)] = np.inf
    return np.argmin(dists)
def grow_by_1(list_of_word_idx, ban_words):
    """Find one new neighbour for every word index in ``list_of_word_idx``.

    For each given index, ``get_closest`` is asked for the nearest word
    while excluding the neighbours picked so far in this call, all of the
    given indices, and everything in ``ban_words``.

    Args:
        list_of_word_idx: list of word indices to find neighbours for.
        ban_words: list of word indices that may not be picked.

    Returns:
        The set of newly picked indices only; the input indices are not
        included.
    """
    picked = set()
    for seed in list_of_word_idx:
        # Rebuilt on every pass so neighbours picked earlier are excluded too.
        excluded = list(picked) + list_of_word_idx + ban_words
        picked.add(get_closest(seed, excluded))
    return picked
if __name__ == '__main__':
    # Seed vocabulary: the current arc label words would be sufficient.
    seed_words = ["geometry", "circle", "square", "wave", "panel", "line", "dot"]
    seed_words_idx = list(map(keys.index, seed_words))

    # Grow the neighbourhood round by round until the budget is reached;
    # pick any budget you want.
    closeby_words = []
    while len(closeby_words) < 1000:
        # Words collected in earlier rounds are banned so every round
        # contributes new ones.
        grow = grow_by_1(seed_words_idx, closeby_words)
        closeby_words.extend(grow)
        print("added these words ")
        print([keys[idx] for idx in grow])
    # TODO: dump closeby_words to a file for later (not implemented here).
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment