Using Word2Vec embeddings in Keras models
from __future__ import print_function

import json
import os

import numpy as np
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from keras.engine import Input
from keras.layers import Embedding, merge
from keras.models import Model

# tokenizer: can change this as needed
tokenize = lambda x: simple_preprocess(x)


def create_embeddings(data_dir, embeddings_path='embeddings.npz', vocab_path='map.json', **params):
    """
    Generate embeddings from a batch of text
    :param data_dir: directory of text files to train the embeddings on
    :param embeddings_path: where to save the embeddings
    :param vocab_path: where to save the word-index map
    """

    class SentenceGenerator(object):
        def __init__(self, dirname):
            self.dirname = dirname

        def __iter__(self):
            for fname in os.listdir(self.dirname):
                for line in open(os.path.join(self.dirname, fname)):
                    yield tokenize(line)

    sentences = SentenceGenerator(data_dir)

    model = Word2Vec(sentences, **params)
    weights = model.syn0
    np.save(open(embeddings_path, 'wb'), weights)

    vocab = dict([(k, v.index) for k, v in model.vocab.items()])
    with open(vocab_path, 'w') as f:
        f.write(json.dumps(vocab))


def load_vocab(vocab_path='map.json'):
    """
    Load word -> index and index -> word mappings
    :param vocab_path: where the word-index map is saved
    :return: word2idx, idx2word
    """
    with open(vocab_path, 'r') as f:
        data = json.loads(f.read())
    word2idx = data
    idx2word = dict([(v, k) for k, v in data.items()])
    return word2idx, idx2word


def word2vec_embedding_layer(embeddings_path='embeddings.npz'):
    """
    Generate an embedding layer from the saved word2vec embeddings
    :param embeddings_path: where the embeddings are saved (as a numpy file)
    :return: the generated embedding layer
    """
    weights = np.load(open(embeddings_path, 'rb'))
    layer = Embedding(input_dim=weights.shape[0], output_dim=weights.shape[1], weights=[weights])
    return layer


if __name__ == '__main__':
    # directory containing the training text is specified in this environment variable
    data_path = os.environ['EMBEDDINGS_TEXT_PATH']

    # variable arguments are passed to gensim's word2vec model
    create_embeddings(data_path, size=100, min_count=5, window=5, sg=1, iter=25)

    word2idx, idx2word = load_vocab()

    # cosine similarity model
    input_a = Input(shape=(1,), dtype='int32', name='input_a')
    input_b = Input(shape=(1,), dtype='int32', name='input_b')
    embeddings = word2vec_embedding_layer()
    embedding_a = embeddings(input_a)
    embedding_b = embeddings(input_b)
    similarity = merge([embedding_a, embedding_b], mode='cos', dot_axes=2)

    model = Model(input=[input_a, input_b], output=[similarity])
    model.compile(optimizer='sgd', loss='mse')  # optimizer and loss don't matter for this example

    while True:
        word_a = raw_input('First word: ')
        if word_a not in word2idx:
            print('Word "%s" is not in the index' % word_a)
            continue
        word_b = raw_input('Second word: ')
        if word_b not in word2idx:
            print('Word "%s" is not in the index' % word_b)
            continue
        output = model.predict([np.asarray([word2idx[word_a]]), np.asarray([word2idx[word_b]])])
        print(output)
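
Note that the gist above targets Python 2 and older library releases: raw_input is Python 2, keras.engine.Input and merge(mode='cos') are Keras 1.x, and model.syn0 / model.vocab are pre-1.0 gensim (both later moved under model.wv). As a rough sketch only (assuming gensim >= 4 and tf.keras from TensorFlow 2; the sentences iterable and the example words below are placeholders, not part of the original gist), the same embedding layer and cosine-similarity model can be wired up like this:

# A minimal, hedged sketch: the same idea with gensim >= 4 and tf.keras (TensorFlow 2).
# `sentences` stands for any iterable of tokenized sentences (e.g. the SentenceGenerator
# above); the example words at the end are placeholders.
import numpy as np
import tensorflow as tf
from gensim.models import Word2Vec

w2v = Word2Vec(sentences, vector_size=100, min_count=5, window=5, sg=1, epochs=25)
weights = w2v.wv.vectors          # gensim 4: the embedding matrix lives on model.wv
word2idx = w2v.wv.key_to_index    # gensim 4: word -> row index mapping

# embedding layer initialised from (and frozen to) the word2vec matrix
embedding = tf.keras.layers.Embedding(
    input_dim=weights.shape[0],
    output_dim=weights.shape[1],
    embeddings_initializer=tf.keras.initializers.Constant(weights),
    trainable=False,
)

# Dot(normalize=True) L2-normalises both vectors before the dot product, i.e. it
# computes the cosine similarity that merge(mode='cos') produced in Keras 1.x
input_a = tf.keras.Input(shape=(1,), dtype='int32', name='input_a')
input_b = tf.keras.Input(shape=(1,), dtype='int32', name='input_b')
flatten = tf.keras.layers.Flatten()
vector_a = flatten(embedding(input_a))
vector_b = flatten(embedding(input_b))
similarity = tf.keras.layers.Dot(axes=1, normalize=True)([vector_a, vector_b])
model = tf.keras.Model(inputs=[input_a, input_b], outputs=similarity)

# e.g. cosine similarity between two words (assuming both are in the vocabulary)
print(model.predict([np.asarray([[word2idx['king']]]), np.asarray([[word2idx['queen']]])]))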