T-SNE function for word embeddings
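The gist references a few helpers and settings that it never defines (save_bin, save_json, print_progress, save_dir, plot_dir, big_plot_dir, num_similar, test_words). The block below is a minimal sketch of what they might look like so the functions can run end to end; the names come from the code, but the bodies and values are assumptions, not the author's originals.

import json
import os
import sys

import numpy as np

save_dir = "output"          # assumed output folder for arrays and json files
plot_dir = "plots"           # assumed folder for the per-word t-SNE plots
big_plot_dir = "big_plots"   # assumed folder for the whole-vocabulary plots
num_similar = 20             # assumed number of neighbours to retrieve
test_words = []              # words to test, later extended with frequent words

for d in (save_dir, plot_dir, big_plot_dir):
    os.makedirs(d, exist_ok=True)

def save_bin(array, filename):
    # assumed helper: persist a numpy array to disk
    np.save(filename, array)

def save_json(data, filename):
    # assumed helper: persist a python object as json
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

def print_progress(count, total):
    # assumed helper: lightweight progress indicator
    sys.stdout.write("\r" + str(count) + "/" + str(total))
    sys.stdout.flush()
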
import io
import os
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
from sklearn.manifold import TSNE


def get_word_frequencies(text):
    r""" This function returns the most common words
    in a given text, together with their frequencies

    Parameters
    ----------
    text: list of str, e.g. df['text'].tolist()

    Return
    ------
    freq: list of (word, count) tuples, most frequent first
    """
    frequencies = Counter()
    # tokenize every sentence and count each token
    tokens = [nltk.word_tokenize(sentence) for sentence in text]
    for token in tokens:
        for word in token:
            frequencies[word] += 1
    freq = frequencies.most_common()
    return freq

def most_similar(input_word, num_similar):
    r""" This function uses word2vec to find the num_similar
    words that are most similar to a given input_word

    Parameters
    ----------
    input_word: str, input word
    num_similar: int, how many similar words we want to get

    Return
    ------
    output: list, [input_word, list of similar words]
    """
    sim = word2vec.wv.most_similar(input_word, topn=num_similar)
    # keep only the words, dropping the similarity scores
    found = [w for w, _ in sim]
    output = [input_word, found]
    return output

def calculate_t_sne(word2vec):
    r""" Main function to compute the t-SNE representation
    of the computed word2vec embeddings

    Return
    ------
    x_coords, y_coords: 1D arrays, 2D t-SNE coordinates of each word
    labels: list, word for each row of the embedding array
    arr: 2D array, word vectors for the whole vocabulary
    """
    # gensim < 4.0 API; with gensim >= 4.0 use word2vec.wv.key_to_index.keys()
    vocab = word2vec.wv.vocab.keys()
    vocab_len = len(vocab)
    dim0 = word2vec.wv.vector_size
    arr = np.empty((0, dim0), dtype='f')
    labels = []
    vectors_file = os.path.join(save_dir, "vocab_vectors.npy")
    labels_file = os.path.join(save_dir, "labels.json")
    print("Creating an array of vectors for each word in the vocab")
    for count, word in enumerate(vocab):
        if count % 50 == 0:
            print_progress(count, vocab_len)
        w_vec = word2vec.wv[word]
        labels.append(word)
        arr = np.append(arr, np.array([w_vec]), axis=0)
    save_bin(arr, vectors_file)
    save_json(labels, labels_file)

    x_c_filename = os.path.join(save_dir, "x_coords.npy")
    y_c_filename = os.path.join(save_dir, "y_coords.npy")
    print("Computing T-SNE for array of length: " + str(len(arr)))
    tsne = TSNE(n_components=2, random_state=1, verbose=1)
    np.set_printoptions(suppress=True)
    Y = tsne.fit_transform(arr)
    x_coords = Y[:, 0]
    y_coords = Y[:, 1]
    print("Saving coords.")
    save_bin(x_coords, x_c_filename)
    save_bin(y_coords, y_c_filename)
    return x_coords, y_coords, labels, arr

def t_sne_scatterplot(word):
    r""" Function to plot the t-SNE result for a given word
    and its nearest neighbours in the embedding space

    Parameters
    ----------
    word: str, word we want the w2v/t-SNE scatter plot for,
          together with its neighbours
    """
    dim0 = word2vec.wv.vector_size
    arr = np.empty((0, dim0), dtype='f')
    w_labels = [word]
    # collect the most similar words around the input word
    nearby = word2vec.wv.similar_by_word(word, topn=num_similar)
    arr = np.append(arr, np.array([word2vec.wv[word]]), axis=0)
    for n in nearby:
        w_vec = word2vec.wv[n[0]]
        w_labels.append(n[0])
        arr = np.append(arr, np.array([w_vec]), axis=0)
    # recent scikit-learn requires perplexity < n_samples (here num_similar + 1)
    tsne = TSNE(n_components=2, random_state=1,
                perplexity=min(30.0, len(arr) - 1))
    np.set_printoptions(suppress=True)
    Y = tsne.fit_transform(arr)
    x_coords = Y[:, 0]
    y_coords = Y[:, 1]
    # plot the input word as a big blue dot and its neighbours in red
    plt.rc("font", size=16)
    plt.figure(figsize=(16, 12), dpi=80)
    plt.scatter(x_coords[0], y_coords[0], s=800, marker="o", color="blue")
    plt.scatter(x_coords[1:], y_coords[1:], s=200, marker="o", color="red")
    for label, x, y in zip(w_labels, x_coords, y_coords):
        plt.annotate(label.upper(), xy=(x, y), xytext=(0, 0),
                     textcoords='offset points')
    plt.xlim(x_coords.min() - 50, x_coords.max() + 50)
    plt.ylim(y_coords.min() - 50, y_coords.max() + 50)
    filename = os.path.join(plot_dir, word + "_tsne.png")
    plt.savefig(filename)
    plt.close()

def test_word2vec(test_words):
    r""" Function to check whether each test word exists in our vocabulary
    and, if so, return the word along with its most similar words, through
    the word embeddings

    Parameters
    ----------
    test_words: list, words to check

    Return
    ------
    output: list, each element is [input word, list of associated words]
    """
    # gensim < 4.0 API; with gensim >= 4.0 use word2vec.wv.key_to_index
    vocab = word2vec.wv.vocab.keys()
    output = []
    associations = {}
    for count, word in enumerate(test_words):
        if word in vocab:
            print("[" + str(count + 1) + "] Testing: " + word)
            if word not in associations:
                associations[word] = []
            similar = most_similar(word, num_similar)
            t_sne_scatterplot(word)
            output.append(similar)
            for s in similar[1]:
                if s not in associations[word]:
                    associations[word].append(s)
        else:
            print("Word " + word + " not in vocab")
    filename = os.path.join(save_dir, "word2vec_test.json")
    save_json(output, filename)
    filename = os.path.join(save_dir, "associations.json")
    save_json(associations, filename)
    # also export the associations as a Source,Target edge list in csv
    filename = os.path.join(save_dir, "associations.csv")
    with io.open(filename, "w", encoding="utf-8") as handle:
        handle.write(u"Source,Target\n")
        for w, sim in associations.items():
            for s in sim:
                handle.write(w + u"," + s + u"\n")
    return output

def show_cluster_locations(results, labels, x_coords, y_coords):
    r""" Function to plot, for each tested word, where the word and its
    neighbours sit on the full t-SNE map

    Parameters
    ----------
    results: list, each element is [word, list of its neighbours]
    labels: list, word for each row of the embedding array
    x_coords, y_coords: 1D arrays, t-SNE coordinates on the 2D plane
    """
    for item in results:
        name = item[0]
        print("Plotting graph for " + name)
        similar = item[1]
        in_set_x = []
        in_set_y = []
        out_set_x = []
        out_set_y = []
        name_x = 0
        name_y = 0
        # split the vocabulary into the word itself, its neighbours
        # and everything else
        for count, word in enumerate(labels):
            xc = x_coords[count]
            yc = y_coords[count]
            if word == name:
                name_x = xc
                name_y = yc
            elif word in similar:
                in_set_x.append(xc)
                in_set_y.append(yc)
            else:
                out_set_x.append(xc)
                out_set_y.append(yc)
        plt.figure(figsize=(16, 12), dpi=80)
        plt.scatter(name_x, name_y, s=400, marker="o", c="blue")
        plt.scatter(in_set_x, in_set_y, s=80, marker="o", c="red")
        plt.scatter(out_set_x, out_set_y, s=8, marker=".", c="black")
        filename = os.path.join(big_plot_dir, name + "_tsne.png")
        plt.savefig(filename)
        plt.close()

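# ---------------------------------------------------------------------------
# NOTE: the driver lines below assume a trained `word2vec` model and a
# dataframe `df` with a 'text' column, neither of which is defined in this
# gist. A minimal sketch of how they might be prepared; the file name,
# column name and Word2Vec parameters are assumptions, not the author's:
# ---------------------------------------------------------------------------
import pandas as pd
from gensim.models import Word2Vec

df = pd.read_csv("corpus.csv")  # hypothetical input file with a 'text' column
sentences = [nltk.word_tokenize(s) for s in df['text'].tolist()]
# 'size' is the gensim < 4.0 keyword; in gensim >= 4.0 it becomes 'vector_size'
word2vec = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)
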
x_coords, y_coords, labels, arr = calculate_t_sne(word2vec)
# and let's save the t-sne plots with the word clusters
frequencies = get_word_frequencies(df['text'].tolist())
# check the first 50 most frequent words and see if they're in the w2v
for item in frequencies[:50]:
    test_words.append(item[0])
results = test_word2vec(test_words)
# and once we have all the words + neighbours let's see how t-SNE has grouped them
show_cluster_locations(results, labels, x_coords, y_coords)