# The network is used for the video description experiments of S2VT [1].
# Please consider citing S2VT [1] if you use this example in your work.
#
# [1] S. Venugopalan, M. Rohrbach, J. Donahue, R. Mooney, T. Darrell,
#     K. Saenko. "Sequence to Sequence - Video to Text." ICCV 2015.
#
# The data is prepared using framefc7_stream_text_to_hdf5.py.
# It is arranged in 32 parallel streams.
|
# Name of this network definition.
name: "s2vt"
|
# TRAIN-phase input: produces the five blobs listed as tops from the HDF5
# files named in the training chunk list.
layer {
  name: "data"
  type: "HDF5Data"
  include {
    phase: TRAIN
  }
  hdf5_data_param {
    batch_size: 80
    source: "./hdf5/buffer_32_s2vt_80/train_batches/hdf5_chunk_list.txt"
  }
  top: "cont_sentence"
  top: "input_sentence"
  top: "target_sentence"
  top: "stage_indicator"
  top: "frame_fc7"
}
|
# TEST-phase input selected by the "test-on-train" stage. It reads the same
# training chunk list as the TRAIN-phase data layer.
layer {
  name: "data"
  type: "HDF5Data"
  include { phase: TEST stage: "test-on-train" }
  hdf5_data_param {
    batch_size: 80
    source: "./hdf5/buffer_32_s2vt_80/train_batches/hdf5_chunk_list.txt"
  }
  top: "cont_sentence"
  top: "input_sentence"
  top: "target_sentence"
  top: "stage_indicator"
  top: "frame_fc7"
}
|
# TEST-phase input selected by the "test-on-val" stage. It reads the
# validation chunk list instead of the training one.
layer {
  name: "data"
  type: "HDF5Data"
  include { phase: TEST stage: "test-on-val" }
  hdf5_data_param {
    batch_size: 80
    source: "./hdf5/buffer_32_s2vt_80/valid_batches/hdf5_chunk_list.txt"
  }
  top: "cont_sentence"
  top: "input_sentence"
  top: "target_sentence"
  top: "stage_indicator"
  top: "frame_fc7"
}
|
# Appends a trailing singleton axis to "stage_indicator" (e.g. from 80 32 to
# 80 32 1) so that it can be used as a bottom of the Concat layers in this
# file, which join their inputs along axis 2.
#
# The first two dimensions are copied from the bottom blob (dim: 0) instead
# of being hard-coded to 80 and 32, so the layer keeps working if the batch
# size or the number of parallel streams of the data layers changes.
layer {
  name: "reshape_stg_indicator"
  type: "Reshape"
  bottom: "stage_indicator"
  top: "stage_indicator_3axis"
  reshape_param {
    shape {
      dim: 0  # copy axis 0 of the bottom blob (80 with the current data)
      dim: 0  # copy axis 1 of the bottom blob (32 with the current data)
      dim: 1  # new trailing axis of size 1
    }
  }
}
|
# Optional in-place dropout on "frame_fc7" (bottom and top are the same
# blob). The layer is only included when the "dropFc7" stage is set.
layer {
  name: "dropFc7"
  type: "Dropout"
  include {
    stage: "dropFc7"
  }
  bottom: "frame_fc7"
  top: "frame_fc7"
  dropout_param {
    dropout_ratio: 0.5
  }
}
|
# Learned linear projection of "frame_fc7" to 500 outputs, computed over the
# last axis of the input (axis: -1).
layer {
  name: "embed_encoder"
  type: "InnerProduct"
  bottom: "frame_fc7"
  top: "embedded_input_frames"
  # First param block: weights. Second param block: bias (no weight decay).
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  inner_product_param {
    axis: -1
    num_output: 500
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0.2 }
  }
}
|
# Embedding of "input_sentence": maps each of the 46168 possible input
# indices to a 500-dimensional vector, without a bias term.
layer {
  name: "embedding"
  type: "Embed"
  bottom: "input_sentence"
  top: "embedded_input_sentence"
  param { lr_mult: 1 }
  embed_param {
    input_dim: 46168  # youtube_movie_vocab + 1
    num_output: 500
    bias_term: false
    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
  }
}
|
# Optional in-place dropout on "embedded_input_sentence". The layer is only
# included when the "dropEn" stage is set.
layer {
  name: "drop_input_en"
  type: "Dropout"
  include {
    stage: "dropEn"
  }
  bottom: "embedded_input_sentence"
  top: "embedded_input_sentence"
  dropout_param {
    dropout_ratio: 0.5
  }
}
|
# Unfactored model: concatenate the embedded frames and the embedded sentence
# (together with the stage indicator) before the first LSTM.
|
# "unfactored" stage only: joins the embedded frames, the embedded sentence
# and the 3-axis stage indicator along axis 2 into a single input blob for
# the first LSTM.
#
# Uses concat_param.axis rather than concat_dim: concat_dim is the deprecated
# alias of axis in Caffe's ConcatParameter, and axis matches the other
# axis-based parameters used by the layers in this file.
layer {
  name: "concat"
  type: "Concat"
  bottom: "embedded_input_frames"
  bottom: "embedded_input_sentence"
  bottom: "stage_indicator_3axis"
  top: "embedded_input_video_sequence"
  include { stage: "unfactored" }
  concat_param {
    axis: 2  # concat along h
  }
}
|
# "unfactored" stage only: first LSTM (1000 outputs) over the concatenated
# frame/sentence/stage-indicator input. "cont_sentence" is supplied as the
# second bottom.
layer {
  name: "lstm1"
  type: "LSTM"
  include {
    stage: "unfactored"
  }
  bottom: "embedded_input_video_sequence"
  bottom: "cont_sentence"
  top: "lstm1"
  recurrent_param {
    num_output: 1000
    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
    bias_filler { type: "constant" value: 0 }
  }
}
|
# Second LSTM (1000 outputs) stacked on "lstm1". The include rule lists both
# the "unfactored" and the "2-layer" stages.
layer {
  name: "lstm2"
  type: "LSTM"
  include { stage: "unfactored" stage: "2-layer" }
  bottom: "lstm1"
  bottom: "cont_sentence"
  top: "lstm2"
  recurrent_param {
    num_output: 1000
    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
    bias_filler { type: "constant" value: 0 }
  }
}
|
# "factored" stage only: first LSTM (1000 outputs) that takes only the
# embedded frames as input, with "cont_sentence" as the second bottom.
layer {
  name: "lstm1"
  type: "LSTM"
  include {
    stage: "factored"
  }
  bottom: "embedded_input_frames"
  bottom: "cont_sentence"
  top: "lstm1"
  recurrent_param {
    num_output: 1000
    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
    bias_filler { type: "constant" value: 0 }
  }
}
|
# Optional in-place dropout on the output of "lstm1". The layer is only
# included when the "dropLstm1" stage is set.
layer {
  name: "drop_lstm1"
  type: "Dropout"
  include {
    stage: "dropLstm1"
  }
  bottom: "lstm1"
  top: "lstm1"
  dropout_param {
    dropout_ratio: 0.5
  }
}
|
# "factored" stage only: joins the output of the first LSTM, the embedded
# sentence and the 3-axis stage indicator along axis 2 into the input blob
# of the second LSTM.
#
# Uses concat_param.axis rather than concat_dim: concat_dim is the deprecated
# alias of axis in Caffe's ConcatParameter, and axis matches the other
# axis-based parameters used by the layers in this file.
layer {
  name: "concat"
  type: "Concat"
  bottom: "lstm1"
  bottom: "embedded_input_sentence"
  bottom: "stage_indicator_3axis"
  top: "lstm1_video_sequence"
  include { stage: "factored" }
  concat_param {
    axis: 2  # concat along h
  }
}
|
# "factored" stage only: second LSTM (1000 outputs) over the concatenation
# of the first LSTM's output, the embedded sentence and the stage indicator.
layer {
  name: "lstm2"
  type: "LSTM"
  include {
    stage: "factored"
  }
  bottom: "lstm1_video_sequence"
  bottom: "cont_sentence"
  top: "lstm2"
  recurrent_param {
    num_output: 1000
    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
    bias_filler { type: "constant" value: 0 }
  }
}
|
# Optional in-place dropout on the output of "lstm2". The layer is only
# included when the "dropLstm2" stage is set.
layer {
  name: "drop_lstm2"
  type: "Dropout"
  include {
    stage: "dropLstm2"
  }
  bottom: "lstm2"
  top: "lstm2"
  dropout_param {
    dropout_ratio: 0.5
  }
}
|
# Output projection used when the "2-layer" stage is NOT set: maps "lstm1"
# to 46168 scores, with the inner product taken from axis 2.
layer {
  name: "predict"
  type: "InnerProduct"
  exclude {
    stage: "2-layer"
  }
  bottom: "lstm1"
  top: "predict"
  # First param block: weights. Second param block: bias (no weight decay).
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  inner_product_param {
    axis: 2
    num_output: 46168  # youtube_movie_vocab + 1
    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
    bias_filler { type: "constant" value: 0 }
  }
}
|
# Output projection used when the "2-layer" stage IS set: maps "lstm2" to
# 46168 scores, with the inner product taken from axis 2.
layer {
  name: "predict"
  type: "InnerProduct"
  include {
    stage: "2-layer"
  }
  bottom: "lstm2"
  top: "predict"
  # First param block: weights. Second param block: bias (no weight decay).
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  inner_product_param {
    axis: 2
    num_output: 46168  # youtube_movie_vocab + 1
    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
    bias_filler { type: "constant" value: 0 }
  }
}
|
# Training objective: softmax cross-entropy between "predict" and
# "target_sentence", with the softmax taken over axis 2. Targets equal to -1
# are ignored, and the loss is weighted by 20.
layer {
  name: "cross_entropy_loss"
  type: "SoftmaxWithLoss"
  bottom: "predict"
  bottom: "target_sentence"
  top: "cross_entropy_loss"
  loss_weight: 20
  softmax_param { axis: 2 }
  loss_param { ignore_label: -1 }
}
|
# TEST-phase metric: accuracy of "predict" against "target_sentence" along
# axis 2, ignoring targets equal to -1.
layer {
  name: "accuracy"
  type: "Accuracy"
  include {
    phase: TEST
  }
  bottom: "predict"
  bottom: "target_sentence"
  top: "accuracy"
  accuracy_param {
    ignore_label: -1
    axis: 2
  }
}