#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function

__author__ = 'maxim'

import numpy as np
import gensim
import string

from keras.callbacks import LambdaCallback
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding
from keras.layers import Dense, Activation
from keras.models import Sequential
from keras.utils.data_utils import get_file

print('\nFetching the text...')
url = 'https://raw.githubusercontent.com/maxim5/stanford-tensorflow-tutorials/master/data/arxiv_abstracts.txt'
path = get_file('arxiv_abstracts.txt', origin=url)

print('\nPreparing the sentences...')
max_sentence_len = 40
with open(path) as file_:
  docs = file_.readlines()
# Tokenize: lowercase, strip punctuation (Python 2 str.translate), keep at most max_sentence_len words per abstract.
sentences = [[word for word in doc.lower().translate(None, string.punctuation).split()[:max_sentence_len]] for doc in docs]
print('Num sentences:', len(sentences))

print('\nTraining word2vec...')
word_model = gensim.models.Word2Vec(sentences, size=100, min_count=1, window=5, iter=100)
pretrained_weights = word_model.wv.syn0  # (vocab_size, embedding_size) matrix learned by word2vec
vocab_size, embedding_size = pretrained_weights.shape
print('Result embedding shape:', pretrained_weights.shape)
print('Checking similar words:')
for word in ['model', 'network', 'train', 'learn']:
  most_similar = ', '.join('%s (%.2f)' % (similar, dist) for similar, dist in word_model.most_similar(word)[:8])
  print('  %s -> %s' % (word, most_similar))

# Mapping between gensim's vocabulary and the integer indices fed to the Keras Embedding layer.
def word2idx(word):
  return word_model.wv.vocab[word].index

def idx2word(idx):
  return word_model.wv.index2word[idx]

print('\nPreparing the data for LSTM...')
# Input: all but the last word of each sentence; target: the last word.
train_x = np.zeros([len(sentences), max_sentence_len], dtype=np.int32)
train_y = np.zeros([len(sentences)], dtype=np.int32)
for i, sentence in enumerate(sentences):
  for t, word in enumerate(sentence[:-1]):
    train_x[i, t] = word2idx(word)
  train_y[i] = word2idx(sentence[-1])
print('train_x shape:', train_x.shape)
print('train_y shape:', train_y.shape)

print('\nTraining LSTM...')
# The Embedding layer is initialized with the word2vec weights and remains trainable.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_size, weights=[pretrained_weights]))
model.add(LSTM(units=embedding_size))
model.add(Dense(units=vocab_size))
model.add(Activation('softmax'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Temperature sampling over a probability distribution; temperature <= 0 falls back to argmax.
def sample(preds, temperature=1.0):
  if temperature <= 0:
    return np.argmax(preds)
  preds = np.asarray(preds).astype('float64')
  preds = np.log(preds) / temperature
  exp_preds = np.exp(preds)
  preds = exp_preds / np.sum(exp_preds)
  probas = np.random.multinomial(1, preds, 1)
  return np.argmax(probas)

# Extend the seed text one word at a time, sampling from the last row of the model's prediction.
def generate_next(text, num_generated=10):
  word_idxs = [word2idx(word) for word in text.lower().split()]
  for i in range(num_generated):
    prediction = model.predict(x=np.array(word_idxs))
    idx = sample(prediction[-1], temperature=0.7)
    word_idxs.append(idx)
  return ' '.join(idx2word(idx) for idx in word_idxs)

# Generate a few continuations after every epoch to monitor progress.
def on_epoch_end(epoch, _):
  print('\nGenerating text after epoch: %d' % epoch)
  texts = [
    'deep convolutional',
    'simple and effective',
    'a nonconvex',
    'a',
  ]
  for text in texts:
    generated = generate_next(text)
    print('%s... -> %s' % (text, generated))

model.fit(train_x, train_y,
          batch_size=128,
          epochs=20,
          callbacks=[LambdaCallback(on_epoch_end=on_epoch_end)])
@maxim5 Can you please explain why the shape of the prediction is [40, 1350] and not [1, 1350]?
If you want the probability of each word given the context, why are there 40 rows?
Hi...have you found the answer??
@keineahnung2345, I was able to get the code working in Python 3.6 by using s.translate(string.punctuation) instead of s.translate(None, string.punctuation).
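For what it's worth, in Python 3 str.translate expects a translation table rather than a deletion string, so the closest equivalent of the original Python 2 call doc.lower().translate(None, string.punctuation) is built with str.maketrans. A minimal sketch (the tokenize helper is only for illustration, not part of the gist):

import string

def tokenize(doc, max_sentence_len=40):
  # str.maketrans('', '', string.punctuation) builds a table that deletes every
  # punctuation character, matching translate(None, string.punctuation) in Python 2.
  table = str.maketrans('', '', string.punctuation)
  return doc.lower().translate(table).split()[:max_sentence_len]

print(tokenize('Deep learning, in practice, works surprisingly well!'))
# ['deep', 'learning', 'in', 'practice', 'works', 'surprisingly', 'well']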
About the [40, 1350] prediction shape: it happens because a prediction is returned for every timestep. Since max_sentence_len is 40, the prediction comes back as a tensor of shape (max_sentence_len, vocabulary_size). That usually happens when the LSTM layer argument return_sequences is True; I'm not sure why it would be active here (I haven't run the code), since the default is False (https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/layers/LSTM).
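In case it's useful, here is a hedged sketch of how to get a single (1, vocab_size) distribution instead, by feeding the accumulated indices as one batch containing one sequence. It reuses word2idx, idx2word, sample, and model from the script above; generate_next_v2 is just an illustrative name, not part of the gist:

import numpy as np

# Hypothetical variant of generate_next: one batch holding one (growing) sequence,
# so model.predict returns shape (1, vocab_size) and we sample from that single row.
def generate_next_v2(text, num_generated=10, temperature=0.7):
  word_idxs = [word2idx(word) for word in text.lower().split()]
  for _ in range(num_generated):
    x = np.array(word_idxs)[np.newaxis, :]  # shape (1, current_length)
    prediction = model.predict(x)           # shape (1, vocab_size)
    idx = sample(prediction[0], temperature=temperature)
    word_idxs.append(idx)
  return ' '.join(idx2word(idx) for idx in word_idxs)

print(generate_next_v2('deep convolutional'))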
Awesome work!
@anjali-123b to show the results, the script uses terms from the selected data set (model, network, deep, ...). If you choose another dataset, those terms will likely be out of vocabulary, so you need to change them (line 36 and 83-88 in rev1).
Hi, what are we supposed to replace these lists with? Common words in our own dataset? Sorry, I don't quite understand...
Thanks
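If it helps: any words that occur in your own corpus will do, since the only requirement is that word2idx finds them in the trained word2vec vocabulary. A hedged sketch that picks probe and seed words automatically from the most frequent vocabulary entries, reusing word_model and generate_next from the script above (probe_words and seed_texts are just illustrative names; same old gensim API as the gist):

# Most frequent words come first in gensim's vocabulary (sorted_vocab=1 by default).
probe_words = word_model.wv.index2word[:4]
print('Checking similar words:')
for word in probe_words:
  most_similar = ', '.join('%s (%.2f)' % (w, d) for w, d in word_model.most_similar(word)[:8])
  print('  %s -> %s' % (word, most_similar))

# Seed texts for generation can be chosen the same way:
seed_texts = [word_model.wv.index2word[0], ' '.join(word_model.wv.index2word[:2])]
for text in seed_texts:
  print('%s... -> %s' % (text, generate_next(text)))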
Hi! What is the point of using a one-hot vector on the prediction layer?
model.add(Dense(units=vocab_size)) model.add(Activation('softmax'))
It seems more suitable to predict the embedding vector itself with a Dense layer with linear activation:
Dense(embedding_size, activation='linear')
The reason: if the network outputs the word Queen instead of King, the gradient should be smaller than if it outputs the word Apple (with one-hot predictions these gradients would be the same).
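For completeness, a hedged sketch of that alternative head, reusing the variables (pretrained_weights, vocab_size, embedding_size, train_x, train_y, word_model) and imports from the script above. This is not the gist's approach; the cosine_proximity loss is my assumption (mean squared error would also work), and decoding needs a nearest-neighbour lookup in the embedding space:

# Regress the target word's embedding instead of classifying over the vocabulary.
alt_model = Sequential()
alt_model.add(Embedding(input_dim=vocab_size, output_dim=embedding_size,
                        weights=[pretrained_weights]))
alt_model.add(LSTM(units=embedding_size))
alt_model.add(Dense(units=embedding_size, activation='linear'))
alt_model.compile(optimizer='adam', loss='cosine_proximity')

# Targets become word vectors rather than integer class ids.
train_y_vec = pretrained_weights[train_y]  # shape (num_sentences, embedding_size)
alt_model.fit(train_x, train_y_vec, batch_size=128, epochs=20)

# Decoding: map a predicted vector back to the nearest word in the embedding space.
pred_vec = alt_model.predict(train_x[:1])[0]
print(word_model.wv.similar_by_vector(pred_vec, topn=3))

Whether this trains better than the softmax head is an empirical question; the gradient argument above is the motivation, but the softmax version has the advantage of producing a proper probability distribution to sample from.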