Created
June 27, 2018 20:39
-
-
Save hesoyamcode/00c36065a00d3404a2c0f7d720615a88 to your computer and use it in GitHub Desktop.
Word2Vec
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from gensim.models.word2vec import Word2Vec | |
from sklearn.manifold import TSNE | |
from sklearn.datasets import fetch_20newsgroups | |
import re | |
import matplotlib.pyplot as plt | |
# Download the 20 Newsgroups example data (this may take a while).
train = fetch_20newsgroups(subset='train')
# Patterns are compiled once at import time. Raw strings keep ``\s`` and
# ``\d`` from being parsed as (invalid) string escape sequences.
# The header match is non-greedy so it stops at the FIRST "Lines: N" marker
# instead of swallowing body text that happens to mention "Lines: N" again.
_HEADER_RE = re.compile(r'^.*?Lines: \d+')
_SENTENCE_END_RE = re.compile(r'[?!.:]\s')
_NON_LETTER_RE = re.compile(r'[^a-zA-Z]')


def clean(text):
    """Turn one raw post into a list of tokenised sentences.

    The post is flattened onto a single line, everything up to and including
    the first ``Lines: <number>`` marker is dropped, and the remainder is
    split into sentences at ``?``, ``!``, ``.`` or ``:`` followed by
    whitespace. Within each sentence every non-letter character becomes a
    space, the text is lower-cased and split on whitespace.

    Args:
        text: The raw post as a single string.

    Returns:
        A list of sentences, each a list of lower-case alphabetic tokens.
        A sentence with no letters yields an empty list.
    """
    # Newlines are removed first so that ``.`` in the header pattern can span
    # what used to be several header lines.
    flattened = text.replace('\n', ' ')
    body = _HEADER_RE.sub('', flattened, count=1)
    return [
        _NON_LETTER_RE.sub(' ', sentence).lower().split()
        for sentence in _SENTENCE_END_RE.split(body)
    ]
# Flatten every post into one corpus-wide list of tokenised sentences.
sentences = [line for text in train.data for line in clean(text)]

# Train the embeddings.
# NOTE(review): `size` and `wv.vocab` are gensim 3.x names; gensim >= 4.0
# renamed them to `vector_size` and `wv.key_to_index` -- update both together
# when upgrading.
model = Word2Vec(sentences, workers=4, size=100, min_count=50, window=10, sample=1e-3)

# Query and index through the KeyedVectors object (`model.wv`); doing so on
# the model itself is deprecated.
print(model.wv.most_similar('memory'))

# One embedding row per vocabulary word, projected to 2-D for plotting.
X = model.wv[model.wv.vocab]
tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(X)
plt.scatter(X_tsne[:, 0], X_tsne[:, 1])
plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment