@codekansas
Last active July 23, 2018 09:17
Using Word2Vec embeddings in Keras models
from __future__ import print_function
import json
import os
import numpy as np
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from keras.engine import Input
from keras.layers import Embedding, merge
from keras.models import Model
# tokenizer: can change this as needed
tokenize = lambda x: simple_preprocess(x)
def create_embeddings(data_dir,
                      embeddings_path='embeddings.npz',
                      vocab_path='map.json',
                      **params):
    """
    Generate embeddings from a batch of text
    :param embeddings_path: where to save the embeddings
    :param vocab_path: where to save the word-index map
    """

    class SentenceGenerator(object):
        def __init__(self, dirname):
            self.dirname = dirname

        def __iter__(self):
            for fname in os.listdir(self.dirname):
                for line in open(os.path.join(self.dirname, fname)):
                    yield tokenize(line)

    sentences = SentenceGenerator(data_dir)

    model = Word2Vec(sentences, **params)
    weights = model.syn0
    np.save(open(embeddings_path, 'wb'), weights)

    vocab = dict([(k, v.index) for k, v in model.vocab.items()])
    with open(vocab_path, 'w') as f:
        f.write(json.dumps(vocab))
def load_vocab(vocab_path='map.json'):
    """
    Load word -> index and index -> word mappings
    :param vocab_path: where the word-index map is saved
    :return: word2idx, idx2word
    """
    with open(vocab_path, 'r') as f:
        data = json.loads(f.read())
    word2idx = data
    idx2word = dict([(v, k) for k, v in data.items()])
    return word2idx, idx2word
def word2vec_embedding_layer(embeddings_path='embeddings.npz'):
    """
    Generate an embedding layer from saved word2vec embeddings
    :param embeddings_path: where the embeddings are saved (as a numpy file)
    :return: the generated embedding layer
    """
    weights = np.load(open(embeddings_path, 'rb'))
    layer = Embedding(input_dim=weights.shape[0],
                      output_dim=weights.shape[1],
                      weights=[weights])
    return layer
if __name__ == '__main__':
    # specify embeddings in this environment variable
    data_path = os.environ['EMBEDDINGS_TEXT_PATH']

    # variable arguments are passed to gensim's word2vec model
    create_embeddings(data_path, size=100, min_count=5,
                      window=5, sg=1, iter=25)

    word2idx, idx2word = load_vocab()

    # cosine similarity model
    input_a = Input(shape=(1,), dtype='int32', name='input_a')
    input_b = Input(shape=(1,), dtype='int32', name='input_b')
    embeddings = word2vec_embedding_layer()
    embedding_a = embeddings(input_a)
    embedding_b = embeddings(input_b)
    similarity = merge([embedding_a, embedding_b],
                       mode='cos', dot_axes=2)
    model = Model(input=[input_a, input_b], output=[similarity])
    model.compile(optimizer='sgd', loss='mse')

    while True:
        word_a = raw_input('First word: ')
        if word_a not in word2idx:
            print('Word "%s" is not in the index' % word_a)
            continue
        word_b = raw_input('Second word: ')
        if word_b not in word2idx:
            print('Word "%s" is not in the index' % word_b)
            continue
        output = model.predict([np.asarray([word2idx[word_a]]),
                                np.asarray([word2idx[word_b]])])
        print(output)
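Note that the similarity model above uses the Keras 1 functional API (merge with mode='cos', and the input=/output= keyword arguments to Model), which was removed in Keras 2. A rough equivalent for Keras 2.x, sketched here as an assumption rather than part of the original gist, builds the cosine similarity with keras.layers.dot and normalize=True:

    from keras.layers import Input, dot
    from keras.models import Model

    # same embedding layer as above, rebuilt with the Keras 2 functional API
    input_a = Input(shape=(1,), dtype='int32', name='input_a')
    input_b = Input(shape=(1,), dtype='int32', name='input_b')
    embeddings = word2vec_embedding_layer()
    embedding_a = embeddings(input_a)
    embedding_b = embeddings(input_b)

    # dot with normalize=True L2-normalises each input along the last axis
    # before the dot product, i.e. it computes cosine similarity directly
    similarity = dot([embedding_a, embedding_b], axes=-1, normalize=True)

    model = Model(inputs=[input_a, input_b], outputs=[similarity])
    model.compile(optimizer='sgd', loss='mse')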
@RC-Jay commented Mar 7, 2017

File "ker.py", line 37, in create_embeddings
weights = model.syn0
AttributeError: 'Word2Vec' object has no attribute 'syn0'

OK, so I am passing a data directory path containing some txt files, and I am getting this error. How should I proceed?

@jbrry commented Jul 26, 2017

@RC-Jay, try changing weights = model.syn0 to weights = model.wv.syn0

If that doesn't work, there may be other calls against older gensim versions in the code that need to be updated. Most of the updated code examples can be found here:

https://groups.google.com/forum/embed/#!topic/gensim/hlYgjqEVocw
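For reference, a sketch of the save step in create_embeddings with the post-1.0 gensim attribute names (in gensim 1.x–3.x the vectors and vocab live on model.wv; in gensim 4.x the names change again, to wv.vectors and wv.key_to_index):

    model = Word2Vec(sentences, **params)
    weights = model.wv.syn0                # was: model.syn0
    np.save(open(embeddings_path, 'wb'), weights)

    # was: model.vocab
    vocab = dict([(k, v.index) for k, v in model.wv.vocab.items()])
    with open(vocab_path, 'w') as f:
        f.write(json.dumps(vocab))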

@alaakh42 commented Oct 9, 2017

@RC-Jay, it is a problem with your gensim package version
try:
weights = model.wv.syn0

@rdmtinez commented Jul 23, 2018

Should I have trained my Word2Vec model such that the tokens '<UNK>', '<GO>', '<EOC>' were also learned ...
