evanthebouncy · September 23, 2020 01:11
diff --git a/get_close.py b/get_close.py
 import numpy as np

 # open whichever w2v file you want
 # The context manager closes the handle as soon as the lines are read, and
 # the explicit encoding keeps decoding independent of the platform default.
 # NOTE(review): assumes the embedding file is UTF-8 text -- confirm if you
 # point this at a different w2v dump.
 with open("glove.6B/glove.6B.50d.txt", encoding="utf-8") as _w2v_file:
    fd = _w2v_file.readlines()

 # return a list of keys (words) and the w2v matrix Nxd
 def to_numpy(lines):
    """Parse embedding text lines into a word list and a vector matrix.

    Each line is expected to look like ``word v1 v2 ... vd`` with single
    spaces between fields.

    Args:
        lines: iterable of strings, one embedding record per line.

    Returns:
        ``(keys, matrix)`` where ``keys[i]`` is the word of the i-th record
        and ``matrix`` is ``np.array`` of the parsed float vectors (N x d
        when every record has d values; shape ``(0,)`` for empty input).

    Raises:
        ValueError: if a value field cannot be converted by ``float``.
    """
    keys = []
    ary = []
    for line in lines:
        record = line.rstrip('\r\n')
        # A blank line (e.g. a trailing newline at end of file) would add a
        # bogus key with an empty vector and make the matrix ragged.
        if not record:
            continue
        # Split once and reuse the fields for both the word and its vector.
        fields = record.split(' ')
        keys.append(fields[0])
        ary.append([float(x) for x in fields[1:]])
    return keys, np.array(ary)

 # Parse the file once at module load: keys[i] is the word whose vector is
 # row i of w2v, so a row index identifies a word everywhere below.
 keys, w2v = to_numpy(fd)

 # given an idx for a word (i.e. 1123)
 # get the closest word in w2v to that word
 def get_closest(word_idx, ignore_idxs):
    """Return the row index of the embedding nearest to ``w2v[word_idx]``.

    Closeness is the squared Euclidean distance between rows of the
    module-level ``w2v`` matrix.

    Args:
        word_idx: row index of the query word in ``w2v``.
        ignore_idxs: list of row indices that must not be returned
            (may be empty).

    Returns:
        The index of the closest row that is neither ``word_idx`` nor in
        ``ignore_idxs``. If every row is excluded, ``np.argmin`` falls back
        to its first-occurrence rule and yields index 0.
    """
    word_vec = w2v[word_idx]
    dists = np.sum((w2v - word_vec)**2, axis=1)
    # ignore distance to self, and also distance to ignore_idxs.
    # Infinity is used instead of a finite sentinel such as 999: a finite
    # value stops excluding rows once real squared distances exceed it,
    # which would let the word itself or an ignored index win the argmin.
    dists[word_idx] = np.inf
    dists[ignore_idxs] = np.inf
    return np.argmin(dists)


 # "grow" a list of word indices by one step: for each index in the list,
 # look up its closest word in w2v via get_closest
 # return only the newly found indices, as a set
 def grow_by_1(list_of_word_idx, ban_words):
    """Find at most one new nearest-neighbour index per seed index.

    Args:
        list_of_word_idx: list of seed row indices to expand from.
        ban_words: list of row indices that may not be selected.

    Returns:
        A set holding the indices returned by ``get_closest`` for the seeds.
    """
    discovered = set()
    for seed_idx in list_of_word_idx:
        # Neighbours already picked in this pass, the seeds themselves and
        # the caller's banned indices are all passed as indices to ignore.
        excluded = list(discovered) + list_of_word_idx + ban_words
        neighbour_idx = get_closest(seed_idx, excluded)
        discovered.add(neighbour_idx)
    return discovered

 if __name__ == '__main__':
    # seed the word from the current arc label words would be sufficient
    seed_words = [
        "geometry",
        "circle",
        "square",
        "wave",
        "panel",
        "line",
        "dot",
    ]

    # Translate every seed word into its row index in the vocabulary.
    seed_words_idx = list(map(keys.index, seed_words))
    closeby_words = []
    # pick any budget you want
    while len(closeby_words) < 1000:
        # Words collected so far are banned, so each pass yields new ones.
        grow = grow_by_1(seed_words_idx, closeby_words)
        closeby_words.extend(grow)
        print("added these words ")
        print([keys[word_idx] for word_idx in grow])

    # dump closeby_words to a file for later
	import numpy as np

	# open whichever w2v file you want
	# The context manager closes the handle as soon as the lines are read, and
	# the explicit encoding keeps decoding independent of the platform default.
	# NOTE(review): assumes the embedding file is UTF-8 text -- confirm if you
	# point this at a different w2v dump.
	with open("glove.6B/glove.6B.50d.txt", encoding="utf-8") as _w2v_file:
	    fd = _w2v_file.readlines()

	# return a list of keys (words) and the w2v matrix Nxd
	def to_numpy(lines):
	    """Parse embedding text lines into a word list and a vector matrix.

	    Each line is expected to look like ``word v1 v2 ... vd`` with single
	    spaces between fields.

	    Args:
	        lines: iterable of strings, one embedding record per line.

	    Returns:
	        ``(keys, matrix)`` where ``keys[i]`` is the word of the i-th record
	        and ``matrix`` is ``np.array`` of the parsed float vectors (N x d
	        when every record has d values; shape ``(0,)`` for empty input).

	    Raises:
	        ValueError: if a value field cannot be converted by ``float``.
	    """
	    keys = []
	    ary = []
	    for line in lines:
	        record = line.rstrip('\r\n')
	        # A blank line (e.g. a trailing newline at end of file) would add a
	        # bogus key with an empty vector and make the matrix ragged.
	        if not record:
	            continue
	        # Split once and reuse the fields for both the word and its vector.
	        fields = record.split(' ')
	        keys.append(fields[0])
	        ary.append([float(x) for x in fields[1:]])
	    return keys, np.array(ary)

	# Parse the file once at module load: keys[i] is the word whose vector is
	# row i of w2v, so a row index identifies a word everywhere below.
	keys, w2v = to_numpy(fd)

	# given an idx for a word (i.e. 1123)
	# get the closest word in w2v to that word
	def get_closest(word_idx, ignore_idxs):
	    """Return the row index of the embedding nearest to ``w2v[word_idx]``.

	    Closeness is the squared Euclidean distance between rows of the
	    module-level ``w2v`` matrix.

	    Args:
	        word_idx: row index of the query word in ``w2v``.
	        ignore_idxs: list of row indices that must not be returned
	            (may be empty).

	    Returns:
	        The index of the closest row that is neither ``word_idx`` nor in
	        ``ignore_idxs``. If every row is excluded, ``np.argmin`` falls back
	        to its first-occurrence rule and yields index 0.
	    """
	    word_vec = w2v[word_idx]
	    dists = np.sum((w2v - word_vec)**2, axis=1)
	    # ignore distance to self, and also distance to ignore_idxs.
	    # Infinity is used instead of a finite sentinel such as 999: a finite
	    # value stops excluding rows once real squared distances exceed it,
	    # which would let the word itself or an ignored index win the argmin.
	    dists[word_idx] = np.inf
	    dists[ignore_idxs] = np.inf
	    return np.argmin(dists)


	# "grow" a list of word indices by one step: for each index in the list,
	# look up its closest word in w2v via get_closest
	# return only the newly found indices, as a set
	def grow_by_1(list_of_word_idx, ban_words):
	    """Find at most one new nearest-neighbour index per seed index.

	    Args:
	        list_of_word_idx: list of seed row indices to expand from.
	        ban_words: list of row indices that may not be selected.

	    Returns:
	        A set holding the indices returned by ``get_closest`` for the seeds.
	    """
	    to_add = set()
	    for word_idx in list_of_word_idx:
	        # Neighbours already picked in this pass, the seeds themselves and
	        # the caller's banned indices are all passed as indices to ignore.
	        new_idx = get_closest(word_idx, list(to_add) + list_of_word_idx + ban_words)
	        to_add.add(new_idx)
	    return to_add

	if __name__ == '__main__':
	    # seed the word from the current arc label words would be sufficient
	    seed_words = ["geometry", "circle", "square", "wave", "panel", "line", "dot"]

	    # Translate every seed word into its row index in the vocabulary.
	    seed_words_idx = [keys.index(x) for x in seed_words]
	    closeby_words = []
	    # pick any budget you want
	    while len(closeby_words) < 1000:
	        # Words collected so far are banned, so each pass yields new ones.
	        grow = grow_by_1(seed_words_idx, closeby_words)
	        closeby_words += list(grow)
	        print("added these words ")
	        print([keys[x] for x in grow])

	    # dump closeby_words to a file for later