@georgepar
Last active October 31, 2018 11:44
word2vec
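# 'sentences' is not defined in the gist; it must be an iterable of tokenized
# sentences (lists of string tokens). A minimal sketch, assuming a plain-text
# corpus and a hypothetical file name 'corpus.txt':
sentences = [line.lower().split() for line in open('corpus.txt', encoding='utf-8')]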
from gensim.models import Word2Vec  # gensim 3.x API (size=, wv.index2word, wv.vocab)
import numpy as np

# Initialize word2vec. With window=5 the context is the 5 previous and 5 next words
model = Word2Vec(sentences, window=5, size=100, workers=4)
model.train(sentences, total_examples=len(sentences), epochs=1000)
# get ordered vocabulary list
voc = model.wv.index2word
# get vector size
dim = model.vector_size
# get most similar words
sim = model.wv.most_similar('holmes')
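# most_similar returns a list of (word, cosine similarity) tuples sorted by
# decreasing similarity. A single word's vector can also be read directly from
# the trained KeyedVectors (sketch; 'holmes' must be in the training vocabulary):
holmes_vector = model.wv['holmes']  # numpy array of shape (model.vector_size,)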
# Convert to numpy 2d array (n_vocab x vector_size)
def to_embeddings_Matrix(model):
    embedding_matrix = np.zeros((len(model.wv.vocab), model.vector_size))
    word2idx = {}
    for i in range(len(model.wv.vocab)):
        embedding_matrix[i] = model.wv[model.wv.index2word[i]]
        word2idx[model.wv.index2word[i]] = i
    return embedding_matrix, model.wv.index2word, word2idx
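
# Example usage of the helper above (a sketch; dimensions follow size=100 set earlier):
embeddings, idx2word, word2idx = to_embeddings_Matrix(model)
print(embeddings.shape)  # (n_vocab, 100)
# Row i of the matrix is the vector of the i-th vocabulary word:
assert (embeddings[word2idx['holmes']] == model.wv['holmes']).all()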