Created
September 23, 2020 01:11
-
-
Save evanthebouncy/132e101de4777d75c5b29c1739cc21bf to your computer and use it in GitHub Desktop.
Get close-by words from w2v for a small set of seed words
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
# Open whichever w2v file you want. The encoding is given explicitly so the
# result does not depend on the platform default (the published GloVe text
# files are expected to be UTF-8), and the context manager closes the file
# handle as soon as all lines have been read.
with open("glove.6B/glove.6B.50d.txt", encoding="utf-8") as glove_file:
    fd = glove_file.readlines()
# return a list of keys (words) and the w2v matrix Nxd | |
def to_numpy(lines):
    """Parse word-vector text lines into a vocabulary and a matrix.

    Each non-blank line is expected to look like ``word v1 v2 ... vd``,
    with single spaces between the fields.

    Args:
        lines: iterable of strings, one word and its vector per line.

    Returns:
        A ``(keys, matrix)`` pair. ``keys`` is the list of words in input
        order and ``matrix`` is a NumPy array whose i-th row holds the
        vector of ``keys[i]`` (N x d).

    Raises:
        ValueError: if a vector component cannot be converted to float.
    """
    keys = []
    ary = []
    for line in lines:
        # Skip blank lines (e.g. a trailing newline at the end of the file);
        # they would otherwise add an empty key and a ragged, empty row.
        if not line.strip():
            continue
        # Split once: the first field is the word, the rest is its vector.
        word, *values = line.rstrip("\n").split(' ')
        keys.append(word)
        ary.append([float(x) for x in values])
    return keys, np.array(ary)
# Parse the loaded lines once at import time: `keys` is the list of words and
# `w2v` the matching array of vectors, one row per word.
keys, w2v = to_numpy(fd)
# given the index of a word in keys (e.g. 1123),
# return the index of the closest other word in w2v
def get_closest(word_idx, ignore_idxs, vectors=None):
    """Return the row index of the nearest neighbour of a word.

    Nearness is the squared Euclidean distance between rows of the
    embedding matrix.

    Args:
        word_idx: row index of the query word (e.g. 1123).
        ignore_idxs: sequence of row indices that must not be returned.
        vectors: optional N x d matrix to search instead of the
            module-level ``w2v``; when omitted, ``w2v`` is used.

    Returns:
        The index (as produced by ``np.argmin``) of the closest row that is
        neither ``word_idx`` itself nor listed in ``ignore_idxs``.
    """
    matrix = w2v if vectors is None else np.asarray(vectors, dtype=float)
    dists = np.sum((matrix - matrix[word_idx]) ** 2, axis=1, dtype=float)
    # Mask the word itself and the ignored rows with +inf. A finite sentinel
    # such as 999 is not an upper bound on squared distances, so a masked
    # row could still win the argmin when all real distances exceed it.
    dists[word_idx] = np.inf
    dists[np.asarray(ignore_idxs, dtype=int)] = np.inf
    return np.argmin(dists)
# take a list of word indices and "grow" it by going over w2v and
# getting the closest not-yet-used word for each w \in words
# return the set of newly found word indices (the input words are not included)
def grow_by_1(list_of_word_idx, ban_words):
    """Find one new nearest neighbour for every word in a list.

    Args:
        list_of_word_idx: indices of the words to grow from.
        ban_words: indices that must never be picked (for example words
            collected by earlier calls).

    Returns:
        The set of newly picked indices, one ``get_closest`` result per
        input word. The input words themselves are not part of the result.
    """
    seeds = list(list_of_word_idx)
    # Build the exclusion list once and extend it as picks are made,
    # instead of re-concatenating all three collections on every iteration.
    excluded = seeds + list(ban_words)
    to_add = set()
    for word_idx in seeds:
        new_idx = get_closest(word_idx, excluded)
        to_add.add(new_idx)
        # A word picked for one seed may not be picked again for another.
        excluded.append(new_idx)
    return to_add
if __name__ == '__main__':
    # The label words of the current arcs are enough to seed the search.
    seed_words = ["geometry", "circle", "square", "wave", "panel", "line", "dot"]
    seed_words_idx = list(map(keys.index, seed_words))
    closeby_words = []
    # Budget (pick any you want): keep growing until at least this many
    # word indices have been collected.
    while len(closeby_words) < 1000:
        new_idxs = grow_by_1(seed_words_idx, closeby_words)
        closeby_words.extend(new_idxs)
        print("added these words ")
        print([keys[idx] for idx in new_idxs])
    # dump closeby_words to a file for later
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment