Last active
September 20, 2019 01:32
-
-
Save tmdavid/52e09956db6ab9ef2438f9144a12da89 to your computer and use it in GitHub Desktop.
Visualize word embeddings, using tsne.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Visualize word embeddings using t-SNE.
For each query word, first computes the cosine similarity to find its 100 closest words, and then
shows a clustering graph (t-SNE scatter plot) of the 10 closest words
(the first one is always the query word itself).
IT REQUIRES A GLOVE MODEL (.txt file).
glove_file = '../TBIR/glove.840B.300d.txt' (in build_glove_dictionary): MODIFY it with the appropriate path.
To use it, you can just type: python word_embedding_vis.py <list of words space separated>
e.g.: python word_embedding_vis.py cake word embedding music
""" | |
""" | |
check some glove words | |
""" | |
from sklearn.metrics.pairwise import cosine_similarity | |
from sklearn.manifold import TSNE | |
from sys import stdout | |
import numpy as np | |
from matplotlib import pyplot | |
import sys | |
def build_glove_dictionary(glove_file='../TBIR/glove.840B.300d.txt', dim=None):
    """
    Build a token -> vector dictionary from a GloVe text file.
    http://nlp.stanford.edu/projects/glove/

    Every non-empty line is expected to look like ``<token> <v1> ... <vN>``,
    with single spaces as separators.  The vector is taken from the *last*
    ``dim`` fields of the line, so a token that itself contains spaces
    (reported for some GloVe releases) is kept in one piece instead of
    making ``float()`` fail.

    REALLY IMPORTANT: the choice of GloVe dataset.  With the big one nearly
    every word is found; the smallest one gives quite bad results.

    Args:
        glove_file: path of the GloVe ``.txt`` model to read.
        dim: number of vector components per line.  When ``None`` it is
            inferred from the first usable line as ``number of fields - 1``.

    Returns:
        dict mapping each token (str) to a 1-D numpy array with its
        pretrained values.

    Raises:
        ValueError: if ``dim`` is given but smaller than 1, or if a vector
            component cannot be parsed as a float.
    """
    if dim is not None and dim < 1:
        raise ValueError('dim must be a positive integer, got %r' % (dim,))

    print('building glove dictionary...')
    glove_dict = {}
    loaded = 0
    with open(glove_file) as fd_glove:
        for line in fd_glove:
            fields = line.rstrip('\r\n').split(' ')
            if len(fields) < 2:
                # blank line or a token without any values: nothing to store
                continue
            if dim is None:
                dim = len(fields) - 1
            # everything before the last `dim` fields is the token
            key = ' '.join(fields[:-dim])
            glove_dict[key] = np.asarray([float(value) for value in fields[-dim:]])
            loaded += 1
            # refreshing the counter on every line is needlessly slow
            if loaded % 10000 == 0:
                stdout.write("\rloading glove dictionary: %d" % loaded)
                stdout.flush()
    # always show the final count, whatever the throttling above printed
    stdout.write("\rloading glove dictionary: %d" % loaded)
    stdout.flush()
    print('')
    print('dictionary build with length %d' % len(glove_dict))
    return glove_dict
def build_glove_matrix(glove_dictionary):
    """
    Stack the vectors of a GloVe dictionary into one matrix.

    Args:
        glove_dictionary: dict mapping token -> vector, as returned by
            build_glove_dictionary().

    Returns:
        A ``(glove_matrix, idx2word)`` tuple: ``glove_matrix`` is a numpy
        array whose row ``i`` is the vector of token ``idx2word[i]``, and
        ``idx2word`` is a dict mapping row index -> token.
    """
    # Iterate the keys once and look the vectors up by key: this works on both
    # Python 2 and Python 3 (dict.iteritems() only exists on Python 2) and
    # guarantees rows and indices follow the same order.
    keys = list(glove_dictionary)
    idx2word = dict(enumerate(keys))
    glove_matrix = [glove_dictionary[key] for key in keys]
    return np.asarray(glove_matrix), idx2word
def check_similarity(glove_matrix, word):
    """
    Compare one word vector against every row of the GloVe matrix.

    Args:
        glove_matrix: matrix with one embedding per row.
        word: vector of the query word; it is reshaped to a single row
            before the comparison.

    Returns:
        The result of sklearn's cosine_similarity for the single-row query
        against glove_matrix (callers read the scores from row 0).
    """
    query_row = word.reshape(1, -1)
    similarities = cosine_similarity(query_row, glove_matrix)
    return similarities
def build_matrix_to_tsne(glove_dict, tokens):
    """
    Collect the vectors of the given tokens, in order.

    Tokens that are not keys of glove_dict are silently skipped, so the
    result can be shorter than tokens.

    Args:
        glove_dict: dict mapping token -> vector.
        tokens: iterable of tokens to look up.

    Returns:
        list with the glove_dict value of every known token.
    """
    return [glove_dict[name] for name in tokens if name in glove_dict]
N_NEIGHBOURS = 100  # closest words per query that are fed to t-SNE
N_PLOTTED = 10      # closest words per query that are actually drawn
DEFAULT_WORDS = ["plant", "factory", "machine", "houseplant", "cake"]


def main(argv=None):
    """
    Plot the nearest GloVe neighbours of the requested words with t-SNE.

    For every query word found in the GloVe dictionary, the N_NEIGHBOURS
    most cosine-similar tokens are collected.  All collected vectors are
    reduced to 2-D together, and the first N_PLOTTED neighbours of each
    query are drawn and annotated with their token.

    Args:
        argv: command-line style argument list (program name first).
            Defaults to sys.argv.  When no words are given, DEFAULT_WORDS
            is used.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        print('Words not specified')
        words = list(DEFAULT_WORDS)
    else:
        words = list(argv[1:])
    print('Words that will be used %s' % (words,))

    glove_dict = build_glove_dictionary()
    glove_matrix, idx2word = build_glove_matrix(glove_dict)

    to_plot = []   # every vector handed to t-SNE
    labels = []    # token of each row in to_plot
    selected = []  # rows of to_plot that end up on the plot
    for word in words:
        # Explicit membership test instead of a bare `except`, so that real
        # errors (and Ctrl-C) are not reported as a missing word.
        if word not in glove_dict:
            print('Word not found %s' % word)
            continue
        cosine_matrix = check_similarity(glove_matrix, glove_dict[word])
        # indices of the most similar tokens, best match first
        closest = cosine_matrix[0].argsort()[-N_NEIGHBOURS:][::-1].tolist()
        tokens = [idx2word[idx] for idx in closest]
        start = len(to_plot)
        to_plot += build_matrix_to_tsne(glove_dict, tokens)
        labels += tokens
        # Remember where this word's neighbours start rather than assuming
        # every word contributed exactly N_NEIGHBOURS rows.
        selected.extend(range(start, min(start + N_PLOTTED, len(to_plot))))

    if not to_plot:
        print('None of the requested words were found, nothing to plot')
        return

    X_hdim = np.array(to_plot)
    print(X_hdim.shape)
    model = TSNE(n_components=2, random_state=0)
    X = model.fit_transform(X_hdim)

    X_x = X[selected]
    labels_x = [labels[row] for row in selected]
    print(labels_x)

    pyplot.scatter(X_x[:, 0], X_x[:, 1])
    for i, label in enumerate(labels_x):
        pyplot.annotate(label, (X_x[i, 0], X_x[i, 1]))
    pyplot.show()


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment