Skip to content

Instantly share code, notes, and snippets.

@Steboss89
Last active June 2, 2022 18:09
Show Gist options
  • Save Steboss89/ce87fbf00e31250634fe3741dd1c6e81 to your computer and use it in GitHub Desktop.
Save Steboss89/ce87fbf00e31250634fe3741dd1c6e81 to your computer and use it in GitHub Desktop.
T-SNE function for word embeddings
def get_word_frequencies(text):
    r"""Count how often each word appears across a list of sentences.

    Parameters
    ----------
    text : list of str
        Sentences to tokenize, e.g. ``df['text'].tolist()``.

    Return
    ------
    freq : list of (str, int)
        (word, count) pairs sorted most-frequent first, as produced by
        ``Counter.most_common()``.
    """
    frequencies = Counter()
    # Counter.update counts an iterable in one call -- no need for the
    # nested manual `frequencies[word] += 1` loop, and tokenizing one
    # sentence at a time avoids materializing every token list up front.
    for sentence in text:
        frequencies.update(nltk.word_tokenize(sentence))
    return frequencies.most_common()
def most_similar(input_word, num_similar):
    r"""Find the ``num_similar`` words closest to ``input_word`` in the
    trained word2vec embedding space.

    Parameters
    ----------
    input_word : str
        Query word (must be in the model vocabulary).
    num_similar : int
        How many neighbours to retrieve.

    Return
    ------
    output : list
        ``[input_word, [neighbour_1, ..., neighbour_num_similar]]``.
    """
    # NOTE: `word2vec` is a module-level gensim model defined elsewhere
    # in this file.
    sim = word2vec.wv.most_similar(input_word, topn=num_similar)
    # most_similar returns (word, score) pairs; keep only the words.
    # (The original also had a dead `output = []` that was immediately
    # overwritten.)
    found = [word for word, _score in sim]
    return [input_word, found]
def calculate_t_sne(word2vec):
    r"""Compute a 2-D t-SNE projection of every word vector in the model.

    Saves the stacked vectors, the labels, and the resulting x/y
    coordinates under the module-level ``save_dir``.

    Parameters
    ----------
    word2vec : gensim word2vec model with a ``.wv`` keyed-vectors attribute.

    Return
    ------
    (x_coords, y_coords, labels, arr) : t-SNE coordinates (1-D arrays),
        the word for each row, and the (n_words, dim) vector matrix.
    """
    vocab = list(word2vec.wv.vocab.keys())
    vocab_len = len(vocab)
    dim0 = word2vec.wv[vocab].shape[1]
    vectors_file = os.path.join(save_dir, "vocab_vectors.npy")
    labels_file = os.path.join(save_dir, "labels.json")
    print("Creating an array of vectors for each word in the vocab")
    labels = []
    vectors = []
    for count, word in enumerate(vocab):
        if count % 50 == 0:
            print_progress(count, vocab_len)
        labels.append(word)
        # .wv[word] is the non-deprecated accessor (word2vec[word] is an
        # alias for it in gensim <= 3.x).
        vectors.append(word2vec.wv[word])
    # Stack once at the end: np.append inside the loop copies the whole
    # array every iteration (quadratic in vocabulary size).
    # reshape keeps the (0, dim0) shape correct for an empty vocabulary.
    arr = np.asarray(vectors, dtype='f').reshape(-1, dim0)
    save_bin(arr, vectors_file)
    save_json(labels, labels_file)
    x_c_filename = os.path.join(save_dir, "x_coords.npy")
    y_c_filename = os.path.join(save_dir, "y_coords.npy")
    print("Computing T-SNE for array of length: " + str(len(arr)))
    tsne = TSNE(n_components=2, random_state=1, verbose=1)
    np.set_printoptions(suppress=True)
    Y = tsne.fit_transform(arr)
    x_coords = Y[:, 0]
    y_coords = Y[:, 1]
    print("Saving coords.")
    save_bin(x_coords, x_c_filename)
    save_bin(y_coords, y_c_filename)
    return x_coords, y_coords, labels, arr
def t_sne_scatterplot(word):
    r"""Plot a t-SNE projection of ``word`` and its nearest neighbours.

    The query word is drawn as a large blue dot, its ``num_similar``
    neighbours as red dots, and the figure is saved to
    ``plot_dir/<word>_tsne.png``.

    Parameters
    ----------
    word : str
        Word to plot together with its word2vec neighbours
        (must be in the model vocabulary).
    """
    # word2vec, num_similar and plot_dir are module-level globals.
    nearby = word2vec.wv.similar_by_word(word, topn=num_similar)
    # Query word first, then its neighbours, in plotting order.
    w_labels = [word] + [neighbour for neighbour, _score in nearby]
    # Build the matrix in one call: np.append inside a loop re-copies
    # the array each iteration (quadratic).
    arr = np.asarray([word2vec.wv[w] for w in w_labels], dtype='f')
    tsne = TSNE(n_components=2, random_state=1)
    np.set_printoptions(suppress=True)
    Y = tsne.fit_transform(arr)
    x_coords = Y[:, 0]
    y_coords = Y[:, 1]
    plt.rc("font", size=16)
    plt.figure(figsize=(16, 12), dpi=80)
    # Row 0 is the query word; the rest are its neighbours.
    plt.scatter(x_coords[0], y_coords[0], s=800, marker="o", color="blue")
    plt.scatter(x_coords[1:], y_coords[1:], s=200, marker="o", color="red")
    for label, x, y in zip(w_labels, x_coords, y_coords):
        plt.annotate(label.upper(), xy=(x, y), xytext=(0, 0), textcoords='offset points')
    plt.xlim(x_coords.min() - 50, x_coords.max() + 50)
    plt.ylim(y_coords.min() - 50, y_coords.max() + 50)
    filename = os.path.join(plot_dir, word + "_tsne.png")
    plt.savefig(filename)
    plt.close()
def test_word2vec(test_words):
    r"""Check which test words exist in the vocabulary and collect their
    most similar words through the word embeddings.

    For each in-vocabulary word this also renders its t-SNE scatterplot
    and records word -> neighbours associations, saved as JSON and CSV
    under the module-level ``save_dir``.

    Parameters
    ----------
    test_words : list of str
        Candidate words to test against the model vocabulary.

    Return
    ------
    output : list
        One ``[word, [neighbours]]`` entry per in-vocabulary test word.
    """
    vocab = word2vec.wv.vocab.keys()
    output = []
    associations = {}
    for count, word in enumerate(test_words):
        # Guard clause: skip out-of-vocabulary words early.
        if word not in vocab:
            print("Word " + word + " not in vocab")
            continue
        print("[" + str(count+1) + "] Testing: " + word)
        associations.setdefault(word, [])
        similar = most_similar(word, num_similar)
        t_sne_scatterplot(word)
        output.append(similar)
        # similar[1] is the neighbour list; keep each one only once.
        for s in similar[1]:
            if s not in associations[word]:
                associations[word].append(s)
    save_json(output, os.path.join(save_dir, "word2vec_test.json"))
    save_json(associations, os.path.join(save_dir, "associations.json"))
    filename = os.path.join(save_dir, "associations.csv")
    # with-statement fixes the leaked file handle in the original
    # (io.open was never closed).
    with io.open(filename, "w", encoding="utf-8") as handle:
        handle.write(u"Source,Target\n")
        for w, sim in associations.items():
            for s in sim:
                handle.write(w + u"," + s + u"\n")
    return output
def show_cluster_locations(results, labels, x_coords, y_coords):
    r"""Plot, for each tested word, where its neighbours land in the
    global t-SNE projection.

    One figure per result: the word itself in blue, its neighbours in
    red, all other vocabulary words as small black dots. Saved to
    ``big_plot_dir/<word>_tsne.png``.

    Parameters
    ----------
    results : list
        ``[word, [neighbours]]`` entries as returned by ``test_word2vec``.
    labels : list of str
        Vocabulary word for each t-SNE row.
    x_coords, y_coords : 1-D arrays
        Global t-SNE coordinates, aligned with ``labels``.
    """
    for item in results:
        name = item[0]
        print("Plotting graph for " + name)
        # Set membership is O(1); the original scanned the neighbour
        # list once per vocabulary word (O(vocab * topn)).
        similar = set(item[1])
        in_set_x, in_set_y = [], []
        out_set_x, out_set_y = [], []
        name_x = name_y = 0
        for word, xc, yc in zip(labels, x_coords, y_coords):
            if word == name:
                name_x, name_y = xc, yc
            elif word in similar:
                in_set_x.append(xc)
                in_set_y.append(yc)
            else:
                out_set_x.append(xc)
                out_set_y.append(yc)
        plt.figure(figsize=(16, 12), dpi=80)
        plt.scatter(name_x, name_y, s=400, marker="o", c="blue")
        plt.scatter(in_set_x, in_set_y, s=80, marker="o", c="red")
        plt.scatter(out_set_x, out_set_y, s=8, marker=".", c="black")
        filename = os.path.join(big_plot_dir, name + "_tsne.png")
        plt.savefig(filename)
        plt.close()
# Script driver: project the whole vocabulary with t-SNE, then inspect
# the 50 most frequent words and how their neighbourhoods cluster.
x_coords, y_coords, labels, arr = calculate_t_sne(word2vec)
# and let's save the t-sne plots with the words clusters
frequencies = get_word_frequencies(df['text'].tolist())
# seed the test set with the 50 most frequent words; frequencies is a
# list of (word, count) pairs, we only need the words
test_words.extend(word for word, _count in frequencies[:50])
results = test_word2vec(test_words)
# and once we have all the word + neighbors let's see how the t-sne has grouped them
show_cluster_locations(results, labels, x_coords, y_coords)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment