-
-
Save udibr/67be473cf053d8c38730 to your computer and use it in GitHub Desktop.
# Variation of https://github.com/ryankiros/skip-thoughts/blob/master/decoding/search.py
def keras_rnn_predict(samples, empty=empty, rnn_model=model, maxlen=maxlen):
    """Return the label probabilities the RNN assigns to each sample.

    The samples are padded to `maxlen` with the `empty` label through
    `sequence.pad_sequences`, and the resulting batch is handed to
    `rnn_model.predict` with `verbose=0`.

    samples   -- sequences of labels, one per sample
    empty     -- label used as the padding value
    rnn_model -- model whose `predict` method scores the padded batch
    maxlen    -- length passed to `pad_sequences`; per the original note,
                 the length of sequences the model can handle
    """
    padded_batch = sequence.pad_sequences(samples, maxlen=maxlen, value=empty)
    label_probs = rnn_model.predict(padded_batch, verbose=0)
    return label_probs
def beamsearch(predict=keras_rnn_predict,
               k=1, maxsample=400, use_unk=False, oov=oov, empty=empty, eos=eos):
    """Beam-search for up to `k` label sequences with the lowest NLL score.

    Every sample starts with the `empty` label (it is kept in the returned
    sequences) and is finished once its last label is `eos` or its length
    reaches `maxsample`.

    predict   -- callable `predict(samples, empty=...)` returning, for each
                 sample, the probability of every possible next label
                 (indexed as a 2-D array: sample x label)
    k         -- beam width, i.e. number of samples to produce
    maxsample -- length at which a sample is cut off
    use_unk   -- when false (and `oov` is not None) the `oov` label is
                 given a prohibitive score so it is never picked
    oov       -- index of the out-of-vocabulary label, or None
    empty     -- label every sample starts with
    eos       -- label that terminates a sample

    Returns `(samples, scores)`: finished samples followed by any still
    unfinished ones, and their matching negative log-likelihood scores.
    """
    finished_samples, finished_scores = [], []
    active_samples, active_scores = [[empty]], [0]
    forbid_oov = not use_unk and oov is not None

    while active_samples and len(finished_samples) < k:
        # Next-label probabilities for every sample still being extended.
        probs = predict(active_samples, empty=empty)

        # Score of each (sample, next label) pair: the sample's running
        # score plus -log of the label's probability.
        totals = np.array(active_scores)[:, None] - np.log(probs)
        if forbid_oov:
            totals[:, oov] = 1e20
        flat_totals = totals.flatten()

        # Keep only as many of the lowest-scoring candidates as there are
        # beam slots not yet taken by finished samples.
        open_slots = k - len(finished_samples)
        best = flat_totals.argsort()[:open_slots]
        best_scores = flat_totals[best]

        # A flat index encodes (parent sample, appended label).
        n_labels = probs.shape[1]
        extended = []
        for flat_idx in best:
            parent, label = divmod(flat_idx, n_labels)
            extended.append(active_samples[parent] + [label])

        # Split the extended samples into finished ones and ones that
        # carry on to the next round.
        active_samples, active_scores = [], []
        for sample, score in zip(extended, best_scores):
            if sample[-1] == eos or len(sample) >= maxsample:
                finished_samples.append(sample)
                finished_scores.append(score)
            else:
                active_samples.append(sample)
                active_scores.append(score)

    return finished_samples + active_samples, finished_scores + active_scores
I am currently working on an image captioning task, for which I have set up a VGG image model and an LSTM language model and merged them together. Here is my model:
`
# Setting up VGG-16
# Convolutional image model: five groups of zero-padded 3x3 convolutions,
# each group closed by a 2x2 max-pool with stride 2, followed by a
# flatten and two 4096-unit dense layers.
image_model = Sequential()
# Group 1: two 64-filter convolutions; the input shape is declared as (3, 224, 224).
image_model.add(ZeroPadding2D((1,1),input_shape=(3,224,224)))
image_model.add(Convolution2D(64, 3, 3, activation='relu'))
image_model.add(ZeroPadding2D((1,1)))
image_model.add(Convolution2D(64, 3, 3, activation='relu'))
image_model.add(MaxPooling2D((2,2), strides=(2,2)))
# Group 2: two 128-filter convolutions.
image_model.add(ZeroPadding2D((1,1)))
image_model.add(Convolution2D(128, 3, 3, activation='relu'))
image_model.add(ZeroPadding2D((1,1)))
image_model.add(Convolution2D(128, 3, 3, activation='relu'))
image_model.add(MaxPooling2D((2,2), strides=(2,2)))
# Group 3: three 256-filter convolutions.
image_model.add(ZeroPadding2D((1,1)))
image_model.add(Convolution2D(256, 3, 3, activation='relu'))
image_model.add(ZeroPadding2D((1,1)))
image_model.add(Convolution2D(256, 3, 3, activation='relu'))
image_model.add(ZeroPadding2D((1,1)))
image_model.add(Convolution2D(256, 3, 3, activation='relu'))
image_model.add(MaxPooling2D((2,2), strides=(2,2)))
# Group 4: three 512-filter convolutions.
image_model.add(ZeroPadding2D((1,1)))
image_model.add(Convolution2D(512, 3, 3, activation='relu'))
image_model.add(ZeroPadding2D((1,1)))
image_model.add(Convolution2D(512, 3, 3, activation='relu'))
image_model.add(ZeroPadding2D((1,1)))
image_model.add(Convolution2D(512, 3, 3, activation='relu'))
image_model.add(MaxPooling2D((2,2), strides=(2,2)))
# Group 5: three more 512-filter convolutions.
image_model.add(ZeroPadding2D((1,1)))
image_model.add(Convolution2D(512, 3, 3, activation='relu'))
image_model.add(ZeroPadding2D((1,1)))
image_model.add(Convolution2D(512, 3, 3, activation='relu'))
image_model.add(ZeroPadding2D((1,1)))
image_model.add(Convolution2D(512, 3, 3, activation='relu'))
image_model.add(MaxPooling2D((2,2), strides=(2,2)))
# Head: flatten, then two 4096-unit ReLU dense layers with dropout (0.5)
# between them.
image_model.add(Flatten())
image_model.add(Dense(4096, activation='relu'))
image_model.add(Dropout(0.5))
image_model.add(Dense(4096, activation='relu'))
# Loading the pre-trained weights here.
# Setting the final model
# NOTE(review): this rebinds `image_model` to a fresh Sequential, so the name
# no longer refers to any model built earlier under it.
image_model = Sequential()
image_model.add(Dense(1000, input_dim=4096, activation="linear"))
# Second branch with the same Dense configuration as image_model.
bb_model = Sequential()
bb_model.add(Dense(1000, input_dim=4096, activation="linear"))
# Concatenates the image_model and bb_model outputs along the last axis.
temp_image_model = Sequential()
temp_image_model.add(Merge([image_model, bb_model], mode='concat', concat_axis=-1))
# Branch taking a 5-dimensional input through a linear Dense layer.
spatial_model = Sequential()
spatial_model.add(Dense(5, input_dim=5, activation="linear"))
# NOTE(review): temp_image_model, spatial_model and net_image_model are not
# referenced again in this snippet; full_model below merges only image_model
# and language_model.
net_image_model = Sequential()
net_image_model.add(Merge([temp_image_model, spatial_model], mode='concat', concat_axis=-1))
# NOTE(review): RepeatVector is appended to image_model after image_model was
# already passed to Merge above. The literal 40 equals max_caption_len below,
# presumably on purpose - confirm.
image_model.add(RepeatVector(40))
print "Image model made."
vocab_size = 10000
max_caption_len = 40
print "Preparing Language Model."
# Language branch: embedding -> LSTM returning the full sequence -> Dense
# applied through the TimeDistributed wrapper.
language_model = Sequential()
language_model.add(Embedding(vocab_size,512, input_length=max_caption_len))
language_model.add(LSTM(output_dim=512, return_sequences=True))
language_model.add(TimeDistributed(Dense(512)))
print "Language Model set."
print "Merging Language and Image Model"
# Final model: concatenate the image and language branches along the last
# axis, run an LSTM that returns only its final output, then a softmax layer.
full_model = Sequential()
full_model.add(Merge([image_model, language_model], mode='concat', concat_axis=-1))
full_model.add(LSTM(512, return_sequences=False))
# The literal 10000 equals vocab_size above - presumably one output per
# vocabulary word; confirm.
full_model.add(Dense(10000))
full_model.add(Activation('softmax'))
print "Models merged..."`
This is my model. I have trained it on partial captions to predict the next word, but I am not able to get an output using full_model.predict().
Could you let me know how to integrate your beamsearch here?
Hi @jeetp465, have you solved your problem? How did you solve it? I am encountering the same situation. Thanks.
Interesting naming conventions
@jeetp465, is your issue resolved?
@jeetp465, @andersonzhu, according to my understanding, beam search is not part of the model definition. It is the way we decode the output of the LSTM (RNN). So beam search is used where you generate words from the predicted output received from the LSTM: the output of full_model.predict() is passed to the beam search to get the output captions.
0 0 android
0 1 adreno