Created
December 10, 2020 20:31
-
-
Save mmahbub/c6e402c9d10e116546d9c6c977f169e4 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def tsnescatterplot(model, word, list_names, size):
    """Plot a 2-D t-SNE map of a query word, its nearest neighbours and extra words.

    The query word is drawn in red, the words returned by
    ``model.wv.most_similar([word])`` in blue, and the words from
    ``list_names`` in green. The vectors are first compressed with PCA and
    then projected to two dimensions with t-SNE before being drawn with
    seaborn on a new 9x9 inch matplotlib figure.

    Args:
        model: Object exposing a ``wv`` attribute that supports
            ``wv[list_of_words]`` and ``wv.most_similar(list_of_words)``
            (presumably a trained gensim Word2Vec model -- confirm with caller).
        word: The query word; must be known to ``model.wv``.
        list_names: Iterable of additional words to plot.
        size: Dimensionality of the word vectors; must match the width of
            the vectors returned by ``model.wv``.

    Returns:
        None. The plot is drawn on the current matplotlib figure.

    Raises:
        ValueError: If ``size`` does not match the vector width (raised by
            the NumPy concatenation).
    """
    # Labels and colours are built in the same order as the vector rows:
    # query word, then its nearest neighbours, then the caller-supplied words.
    neighbours = [similar for similar, _score in model.wv.most_similar([word])]
    extras = list(list_names)
    word_labels = [word] + neighbours + extras
    color_list = ['red'] + ['blue'] * len(neighbours) + ['green'] * len(extras)

    # One batched lookup and a single concatenate instead of growing the
    # array with np.append per word (which copies the whole array each time).
    # The empty (0, size) block keeps the original width check and dtype.
    arrays = np.concatenate(
        [np.empty((0, size), dtype='f'), model.wv[word_labels]],
        axis=0,
    )
    n_samples, n_features = arrays.shape

    # PCA down to at most 21 components. PCA cannot produce more components
    # than min(n_samples, n_features), so clamp rather than crash when only
    # a few words are plotted or the vectors are low-dimensional.
    n_components = min(21, n_samples, n_features)
    reduc = PCA(n_components=n_components).fit_transform(arrays)

    # Kept from the original: switches NumPy printing to non-scientific
    # notation process-wide (not needed for the plot itself).
    np.set_printoptions(suppress=True)

    # t-SNE requires perplexity < n_samples; 15 is used whenever possible.
    perplexity = min(15, n_samples - 1)
    Y = TSNE(
        n_components=2, random_state=0, perplexity=perplexity
    ).fit_transform(reduc)

    # Sets everything up to plot
    df = pd.DataFrame({'x': Y[:, 0],
                       'y': Y[:, 1],
                       'words': word_labels,
                       'color': color_list})

    fig, _ = plt.subplots()
    fig.set_size_inches(9, 9)

    # Basic scatter plot; fit_reg=False disables the regression line.
    p1 = sns.regplot(data=df,
                     x="x",
                     y="y",
                     fit_reg=False,
                     marker="o",
                     scatter_kws={'s': 40,
                                  'facecolors': df['color']})

    # Annotate every point with its (title-cased) word in the point's colour.
    for x_pos, y_pos, label, colour in zip(df['x'], df['y'],
                                           df['words'], df['color']):
        p1.text(x_pos,
                y_pos,
                ' ' + label.title(),
                horizontalalignment='left',
                verticalalignment='bottom',
                size=15,
                color=colour,
                weight='normal')

    # Pad the axes so labels near the border are not clipped.
    plt.xlim(Y[:, 0].min() - 50, Y[:, 0].max() + 50)
    plt.ylim(Y[:, 1].min() - 50, Y[:, 1].max() + 50)

    plt.title('t-SNE visualization for {}'.format(word.title()))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment