Using Word2Vec embeddings in Keras models
from __future__ import print_function

import json
import os

import numpy as np
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from keras.engine import Input
from keras.layers import Embedding, merge
from keras.models import Model

# tokenizer: can change this as needed
tokenize = lambda x: simple_preprocess(x)


def create_embeddings(data_dir,
                      embeddings_path='embeddings.npz',
                      vocab_path='map.json',
                      **params):
    """
    Generate embeddings from a batch of text
    :param data_dir: directory of text files to train on
    :param embeddings_path: where to save the embeddings
    :param vocab_path: where to save the word-index map
    """

    class SentenceGenerator(object):
        def __init__(self, dirname):
            self.dirname = dirname

        def __iter__(self):
            for fname in os.listdir(self.dirname):
                for line in open(os.path.join(self.dirname, fname)):
                    yield tokenize(line)

    sentences = SentenceGenerator(data_dir)

    model = Word2Vec(sentences, **params)
    # NOTE: on newer gensim versions these attributes live on model.wv
    # (model.wv.syn0 and model.wv.vocab); see the comments below
    weights = model.syn0
    # saving through an open file handle keeps the exact filename
    # (np.save given a plain path would append '.npy')
    np.save(open(embeddings_path, 'wb'), weights)

    # map each word to its row index in the weight matrix
    vocab = dict([(k, v.index) for k, v in model.vocab.items()])
    with open(vocab_path, 'w') as f:
        f.write(json.dumps(vocab))


def load_vocab(vocab_path='map.json'):
    """
    Load word -> index and index -> word mappings
    :param vocab_path: where the word-index map is saved
    :return: word2idx, idx2word
    """
    with open(vocab_path, 'r') as f:
        data = json.loads(f.read())
    word2idx = data
    idx2word = dict([(v, k) for k, v in data.items()])
    return word2idx, idx2word


def word2vec_embedding_layer(embeddings_path='embeddings.npz'):
    """
    Generate an embedding layer from saved word2vec embeddings
    :param embeddings_path: where the embeddings are saved (as a numpy file)
    :return: the generated embedding layer
    """
    weights = np.load(open(embeddings_path, 'rb'))
    layer = Embedding(input_dim=weights.shape[0],
                      output_dim=weights.shape[1],
                      weights=[weights])
    return layer


if __name__ == '__main__':
    # specify the directory of training text in this environment variable
    data_path = os.environ['EMBEDDINGS_TEXT_PATH']

    # variable arguments are passed to gensim's word2vec model
    create_embeddings(data_path, size=100, min_count=5,
                      window=5, sg=1, iter=25)

    word2idx, idx2word = load_vocab()

    # cosine similarity model (Keras 1.x merge API; see the Keras 2
    # equivalent sketched below)
    input_a = Input(shape=(1,), dtype='int32', name='input_a')
    input_b = Input(shape=(1,), dtype='int32', name='input_b')
    embeddings = word2vec_embedding_layer()
    embedding_a = embeddings(input_a)
    embedding_b = embeddings(input_b)
    similarity = merge([embedding_a, embedding_b],
                       mode='cos', dot_axes=2)
    model = Model(input=[input_a, input_b], output=[similarity])
    model.compile(optimizer='sgd', loss='mse')

    while True:
        word_a = raw_input('First word: ')  # Python 2; use input() on Python 3
        if word_a not in word2idx:
            print('Word "%s" is not in the index' % word_a)
            continue
        word_b = raw_input('Second word: ')
        if word_b not in word2idx:
            print('Word "%s" is not in the index' % word_b)
            continue
        output = model.predict([np.asarray([word2idx[word_a]]),
                                np.asarray([word2idx[word_b]])])
        print(output)
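A side note on the Keras API: merge with mode='cos' was removed in Keras 2. A minimal sketch of the same cosine-similarity model in the Keras 2 functional API (assuming keras >= 2.0, where dot(..., normalize=True) L2-normalizes its inputs before the dot product, i.e. computes cosine similarity):

from keras.layers import Input, dot
from keras.models import Model

# same inputs and shared embedding layer as above
input_a = Input(shape=(1,), dtype='int32', name='input_a')
input_b = Input(shape=(1,), dtype='int32', name='input_b')
embeddings = word2vec_embedding_layer()
embedding_a = embeddings(input_a)
embedding_b = embeddings(input_b)
# dot with normalize=True is the Keras 2 replacement for mode='cos'
similarity = dot([embedding_a, embedding_b], axes=2, normalize=True)
model = Model(inputs=[input_a, input_b], outputs=[similarity])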
Ok, so I am passing a data directory path containing some txt files, and I am getting this error. How should I proceed?

File "ker.py", line 37, in create_embeddings
    weights = model.syn0
AttributeError: 'Word2Vec' object has no attribute 'syn0'

@RC-Jay, try changing weights = model.syn0 to weights = model.wv.syn0.
If that doesn't work, there may be other older gensim code in the gist which needs to be updated. Most of the updated code examples can be found here:
https://groups.google.com/forum/embed/#!topic/gensim/hlYgjqEVocw

@RC-Jay, it is a problem with your gensim package version.
try:
weights = model.wv.syn0
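For newer gensim releases the attributes moved again; a minimal sketch of the same weight and vocab extraction (assuming gensim >= 4.0, where syn0 became wv.vectors and the vocab index map became wv.key_to_index):

weights = model.wv.vectors           # replaces model.syn0 / model.wv.syn0
vocab = dict(model.wv.key_to_index)  # replaces the model.vocab index map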
Should I have trained my Word2Vec model such that the tokens '<UNK>', '<GO>', '<EOC>' were also learned ...
File "ker.py", line 37, in create_embeddings
weights = model.syn0
AttributeError: 'Word2Vec' object has no attribute 'syn0'
Ok, So I am passing a data directory path containing some txt files. I am getting this error. How should i proceed?