vsubhashini · January 24, 2023 21:43
diff --git a/readme.md b/readme.md
diff --git a/s2vt.prototxt b/s2vt.prototxt
 # The network is used for the video description experiments of S2VT [1].
 # Please consider citing S2VT [1] if you use this example in your work.
 #
 # [1] S. Venugopalan, M. Rohrbach, J. Donahue, R. Mooney, T. Darrell,
 #     K. Saenko. "Sequence to Sequence - Video to Text." ICCV 2015.

 # The data is prepared using framefc7_stream_text_to_hdf5.py
 # and is arranged in 32 parallel streams.
 name: "s2vt"
 # TRAIN-phase input: reads pre-built HDF5 batches from the training chunk list.
 layer {
  name: "data"
  type: "HDF5Data"
  include {
    phase: TRAIN
  }
  hdf5_data_param {
    batch_size: 80
    source: "./hdf5/buffer_32_s2vt_80/train_batches/hdf5_chunk_list.txt"
  }
  # Output blobs (order kept exactly as in the original definition).
  top: "cont_sentence"
  top: "input_sentence"
  top: "target_sentence"
  top: "stage_indicator"
  top: "frame_fc7"
 }
 # TEST-phase input used when the test state carries stage "test-on-train":
 # it reads the same training chunk list as the TRAIN-phase data layer.
 layer {
  name: "data"
  type: "HDF5Data"
  include { phase: TEST stage: "test-on-train" }
  hdf5_data_param {
    batch_size: 80
    source: "./hdf5/buffer_32_s2vt_80/train_batches/hdf5_chunk_list.txt"
  }
  # Output blobs (order kept exactly as in the original definition).
  top: "cont_sentence"
  top: "input_sentence"
  top: "target_sentence"
  top: "stage_indicator"
  top: "frame_fc7"
 }
 # TEST-phase input used when the test state carries stage "test-on-val":
 # it reads the validation chunk list.
 layer {
  name: "data"
  type: "HDF5Data"
  include { phase: TEST stage: "test-on-val" }
  hdf5_data_param {
    batch_size: 80
    source: "./hdf5/buffer_32_s2vt_80/valid_batches/hdf5_chunk_list.txt"
  }
  # Output blobs (order kept exactly as in the original definition).
  top: "cont_sentence"
  top: "input_sentence"
  top: "target_sentence"
  top: "stage_indicator"
  top: "frame_fc7"
 }
 # Adds a trailing singleton axis to stage_indicator (80 x 32 -> 80 x 32 x 1)
 # so it can be concatenated along axis 2 with the other inputs.
 # NOTE(review): 80 and 32 are hard-coded; presumably they must track
 # batch_size and the number of parallel streams -- confirm before changing either.
 layer {
  name: "reshape_stg_indicator"
  type: "Reshape"
  bottom: "stage_indicator"
  top: "stage_indicator_3axis"
  reshape_param {
    shape { dim: 80 dim: 32 dim: 1 }
  }
 }
 # Optional in-place dropout on the frame features; the layer is only part of
 # the net when the state includes stage "dropFc7".
 layer {
  name: "dropFc7"
  type: "Dropout"
  include {
    stage: "dropFc7"
  }
  dropout_param {
    dropout_ratio: 0.5
  }
  bottom: "frame_fc7"
  top: "frame_fc7"
 }
 # Linear projection of the frame features to a 500-dimensional embedding.
 layer {
  name: "embed_encoder"
  type: "InnerProduct"
  bottom: "frame_fc7"
  top: "embedded_input_frames"
  inner_product_param {
    axis: -1  # inner product over the last axis only; leading axes are kept
    num_output: 500
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0.2 }
  }
  # First param block (weights): base learning rate, weight decay applied.
  param { lr_mult: 1 decay_mult: 1 }
  # Second param block (bias): doubled learning rate, no weight decay.
  param { lr_mult: 2 decay_mult: 0 }
 }
 # Learned lookup table mapping each input word index to a 500-d vector.
 layer {
  name: "embedding"
  type: "Embed"
  bottom: "input_sentence"
  top: "embedded_input_sentence"
  embed_param {
    input_dim: 46168  # youtube_movie_vocab + 1
    num_output: 500
    bias_term: false
    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
  }
  param { lr_mult: 1 }
 }
 # Optional in-place dropout on the word embeddings; the layer is only part of
 # the net when the state includes stage "dropEn".
 layer {
  name: "drop_input_en"
  type: "Dropout"
  include {
    stage: "dropEn"
  }
  dropout_param {
    dropout_ratio: 0.5
  }
  bottom: "embedded_input_sentence"
  top: "embedded_input_sentence"
 }
 # Unfactored model: concatenate the frame embeddings, the word embeddings and
 # the stage indicator along axis 2 to form the input of the first LSTM.
 layer {
  name: "concat"
  type: "Concat"
  bottom: "embedded_input_frames"
  bottom: "embedded_input_sentence"
  bottom: "stage_indicator_3axis"
  top: "embedded_input_video_sequence"
  # "axis" replaces the deprecated "concat_dim" alias (same value, same result).
  concat_param { axis: 2 }
  include { stage: "unfactored" }
 }
 # Unfactored model, first LSTM: consumes the concatenated frame/word/stage input.
 layer {
  name: "lstm1"
  type: "LSTM"
  include {
    stage: "unfactored"
  }
  bottom: "embedded_input_video_sequence"
  # Second bottom of the recurrent layer (sequence continuation markers).
  bottom: "cont_sentence"
  top: "lstm1"
  recurrent_param {
    num_output: 1000
    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
    bias_filler { type: "constant" value: 0 }
  }
 }
 # Unfactored model, second LSTM stacked on lstm1. The single include rule
 # lists two stages, so the state must carry both "unfactored" and "2-layer".
 layer {
  name: "lstm2"
  type: "LSTM"
  include { stage: "unfactored" stage: "2-layer" }
  bottom: "lstm1"
  # Second bottom of the recurrent layer (sequence continuation markers).
  bottom: "cont_sentence"
  top: "lstm2"
  recurrent_param {
    num_output: 1000
    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
    bias_filler { type: "constant" value: 0 }
  }
 }
 # Factored model, first LSTM: sees only the embedded frame features.
 layer {
  name: "lstm1"
  type: "LSTM"
  include {
    stage: "factored"
  }
  bottom: "embedded_input_frames"
  # Second bottom of the recurrent layer (sequence continuation markers).
  bottom: "cont_sentence"
  top: "lstm1"
  recurrent_param {
    num_output: 1000
    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
    bias_filler { type: "constant" value: 0 }
  }
 }
 # Optional in-place dropout on the lstm1 output; the layer is only part of
 # the net when the state includes stage "dropLstm1".
 layer {
  name: "drop_lstm1"
  type: "Dropout"
  include {
    stage: "dropLstm1"
  }
  dropout_param {
    dropout_ratio: 0.5
  }
  bottom: "lstm1"
  top: "lstm1"
 }
 # Factored model: concatenate the lstm1 output, the word embeddings and the
 # stage indicator along axis 2 to form the input of the second LSTM.
 layer {
  name: "concat"
  type: "Concat"
  bottom: "lstm1"
  bottom: "embedded_input_sentence"
  bottom: "stage_indicator_3axis"
  top: "lstm1_video_sequence"
  # "axis" replaces the deprecated "concat_dim" alias (same value, same result).
  concat_param { axis: 2 }
  include { stage: "factored" }
 }
 # Factored model, second LSTM: consumes lstm1 output joined with the word
 # embeddings and the stage indicator.
 layer {
  name: "lstm2"
  type: "LSTM"
  include {
    stage: "factored"
  }
  bottom: "lstm1_video_sequence"
  # Second bottom of the recurrent layer (sequence continuation markers).
  bottom: "cont_sentence"
  top: "lstm2"
  recurrent_param {
    num_output: 1000
    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
    bias_filler { type: "constant" value: 0 }
  }
 }
 # Optional in-place dropout on the lstm2 output; the layer is only part of
 # the net when the state includes stage "dropLstm2".
 layer {
  name: "drop_lstm2"
  type: "Dropout"
  include {
    stage: "dropLstm2"
  }
  dropout_param {
    dropout_ratio: 0.5
  }
  bottom: "lstm2"
  top: "lstm2"
 }
 # Vocabulary scores for single-LSTM variants: reads lstm1 and is dropped from
 # the net whenever the state includes stage "2-layer".
 layer {
  name: "predict"
  type: "InnerProduct"
  exclude {
    stage: "2-layer"
  }
  bottom: "lstm1"
  top: "predict"
  inner_product_param {
    axis: 2
    num_output: 46168  # youtube_movie_vocab + 1
    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
    bias_filler { type: "constant" value: 0 }
  }
  # First param block (weights): base learning rate, weight decay applied.
  param { lr_mult: 1 decay_mult: 1 }
  # Second param block (bias): doubled learning rate, no weight decay.
  param { lr_mult: 2 decay_mult: 0 }
 }
 # Vocabulary scores for two-LSTM variants: reads lstm2 and is only part of
 # the net when the state includes stage "2-layer".
 layer {
  name: "predict"
  type: "InnerProduct"
  include {
    stage: "2-layer"
  }
  bottom: "lstm2"
  top: "predict"
  inner_product_param {
    axis: 2
    num_output: 46168  # youtube_movie_vocab + 1
    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
    bias_filler { type: "constant" value: 0 }
  }
  # First param block (weights): base learning rate, weight decay applied.
  param { lr_mult: 1 decay_mult: 1 }
  # Second param block (bias): doubled learning rate, no weight decay.
  param { lr_mult: 2 decay_mult: 0 }
 }
 # Softmax cross-entropy between the predicted scores and the target words.
 layer {
  name: "cross_entropy_loss"
  type: "SoftmaxWithLoss"
  bottom: "predict"
  bottom: "target_sentence"
  top: "cross_entropy_loss"
  softmax_param { axis: 2 }        # softmax over axis 2 of "predict"
  loss_param { ignore_label: -1 }  # targets equal to -1 do not contribute
  loss_weight: 20
 }
 # Prediction accuracy, reported in the TEST phase only.
 layer {
  name: "accuracy"
  type: "Accuracy"
  include {
    phase: TEST
  }
  bottom: "predict"
  bottom: "target_sentence"
  top: "accuracy"
  accuracy_param {
    ignore_label: -1  # targets equal to -1 are skipped
    axis: 2
  }
 }
diff --git a/s2vt_solver.prototxt b/s2vt_solver.prototxt
 # Solver configuration for the S2VT network definition.
 net: "./s2vt.prototxt"

 # s2vt.prototxt supports several sequence-to-sequence architectures,
 # selected through the state stages below:
 #   (1) stage: 'factored'   stage: '2-layer'
 #   (2) stage: 'unfactored' stage: '1-layer'
 #   (3) stage: 'unfactored' stage: '2-layer'
 # Optional add-on stages:
 #   (a) 'dropFc7'   [input frame feature dropout]
 #   (b) 'dropEn'    [text feature dropout after embedding]
 #   (c) 'dropLstm1' [dropout on output of lstm1]
 #   (d) 'dropLstm2' [dropout on output of lstm2]
 #
 # This solver uses variant (1), which performed best on the youtube dataset.
 # To use a different variant, change train_state and both test_state
 # entries together.
 train_state {
   stage: "factored"
   stage: "2-layer"
 }

 # Two test nets; each test_iter pairs with the test_state that follows it.
 test_iter: 25
 test_state {
   stage: "factored"
   stage: "2-layer"
   stage: "test-on-train"
 }
 test_iter: 25
 test_state {
   stage: "factored"
   stage: "2-layer"
   stage: "test-on-val"
 }
 test_interval: 1000

 # Optimisation.
 base_lr: 0.01
 momentum: 0.9
 weight_decay: 0.0000
 clip_gradients: 10

 # Learning-rate schedule.
 # NOTE(review): stepsize (20000) exceeds max_iter (18000), so the "step"
 # decay by gamma never fires and the rate stays at base_lr -- confirm intended.
 lr_policy: "step"
 gamma: 0.5
 stepsize: 20000
 max_iter: 18000

 # Logging and snapshots.
 display: 1
 average_loss: 100
 snapshot: 1000
 snapshot_prefix: "./snapshots/s2vt_youtube_vgg"

 # Execution.
 solver_mode: GPU
 random_seed: 1701
	# The network is used for the video description experiments of S2VT [1].
	# Please consider citing S2VT [1] if you use this example in your work.
	#
	# [1] S. Venugopalan, M. Rohrbach, J. Donahue, R. Mooney, T. Darrell,
	# K. Saenko. "Sequence to Sequence - Video to Text." ICCV 2015.

	# The data is prepared using framefc7_stream_text_to_hdf5.py
	# and is arranged in 32 parallel streams.
	name: "s2vt"
	# TRAIN-phase input: reads pre-built HDF5 batches from the training chunk list.
	layer {
	  name: "data"
	  type: "HDF5Data"
	  include {
	    phase: TRAIN
	  }
	  hdf5_data_param {
	    batch_size: 80
	    source: "./hdf5/buffer_32_s2vt_80/train_batches/hdf5_chunk_list.txt"
	  }
	  # Output blobs (order kept exactly as in the original definition).
	  top: "cont_sentence"
	  top: "input_sentence"
	  top: "target_sentence"
	  top: "stage_indicator"
	  top: "frame_fc7"
	}
	# TEST-phase input used when the test state carries stage "test-on-train":
	# it reads the same training chunk list as the TRAIN-phase data layer.
	layer {
	  name: "data"
	  type: "HDF5Data"
	  include { phase: TEST stage: "test-on-train" }
	  hdf5_data_param {
	    batch_size: 80
	    source: "./hdf5/buffer_32_s2vt_80/train_batches/hdf5_chunk_list.txt"
	  }
	  # Output blobs (order kept exactly as in the original definition).
	  top: "cont_sentence"
	  top: "input_sentence"
	  top: "target_sentence"
	  top: "stage_indicator"
	  top: "frame_fc7"
	}
	# TEST-phase input used when the test state carries stage "test-on-val":
	# it reads the validation chunk list.
	layer {
	  name: "data"
	  type: "HDF5Data"
	  include { phase: TEST stage: "test-on-val" }
	  hdf5_data_param {
	    batch_size: 80
	    source: "./hdf5/buffer_32_s2vt_80/valid_batches/hdf5_chunk_list.txt"
	  }
	  # Output blobs (order kept exactly as in the original definition).
	  top: "cont_sentence"
	  top: "input_sentence"
	  top: "target_sentence"
	  top: "stage_indicator"
	  top: "frame_fc7"
	}
	# Adds a trailing singleton axis to stage_indicator (80 x 32 -> 80 x 32 x 1)
	# so it can be concatenated along axis 2 with the other inputs.
	# NOTE(review): 80 and 32 are hard-coded; presumably they must track
	# batch_size and the number of parallel streams -- confirm before changing either.
	layer {
	  name: "reshape_stg_indicator"
	  type: "Reshape"
	  bottom: "stage_indicator"
	  top: "stage_indicator_3axis"
	  reshape_param {
	    shape { dim: 80 dim: 32 dim: 1 }
	  }
	}
	# Optional in-place dropout on the frame features; the layer is only part of
	# the net when the state includes stage "dropFc7".
	layer {
	  name: "dropFc7"
	  type: "Dropout"
	  include {
	    stage: "dropFc7"
	  }
	  dropout_param {
	    dropout_ratio: 0.5
	  }
	  bottom: "frame_fc7"
	  top: "frame_fc7"
	}
	# Linear projection of the frame features to a 500-dimensional embedding.
	layer {
	  name: "embed_encoder"
	  type: "InnerProduct"
	  bottom: "frame_fc7"
	  top: "embedded_input_frames"
	  inner_product_param {
	    axis: -1  # inner product over the last axis only; leading axes are kept
	    num_output: 500
	    weight_filler { type: "xavier" }
	    bias_filler { type: "constant" value: 0.2 }
	  }
	  # First param block (weights): base learning rate, weight decay applied.
	  param { lr_mult: 1 decay_mult: 1 }
	  # Second param block (bias): doubled learning rate, no weight decay.
	  param { lr_mult: 2 decay_mult: 0 }
	}
	# Learned lookup table mapping each input word index to a 500-d vector.
	layer {
	  name: "embedding"
	  type: "Embed"
	  bottom: "input_sentence"
	  top: "embedded_input_sentence"
	  embed_param {
	    input_dim: 46168  # youtube_movie_vocab + 1
	    num_output: 500
	    bias_term: false
	    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
	  }
	  param { lr_mult: 1 }
	}
	# Optional in-place dropout on the word embeddings; the layer is only part of
	# the net when the state includes stage "dropEn".
	layer {
	  name: "drop_input_en"
	  type: "Dropout"
	  include {
	    stage: "dropEn"
	  }
	  dropout_param {
	    dropout_ratio: 0.5
	  }
	  bottom: "embedded_input_sentence"
	  top: "embedded_input_sentence"
	}
	# Unfactored model: concatenate the frame embeddings, the word embeddings and
	# the stage indicator along axis 2 to form the input of the first LSTM.
	layer {
	  name: "concat"
	  type: "Concat"
	  bottom: "embedded_input_frames"
	  bottom: "embedded_input_sentence"
	  bottom: "stage_indicator_3axis"
	  top: "embedded_input_video_sequence"
	  # "axis" replaces the deprecated "concat_dim" alias (same value, same result).
	  concat_param { axis: 2 }
	  include { stage: "unfactored" }
	}
	# Unfactored model, first LSTM: consumes the concatenated frame/word/stage input.
	layer {
	  name: "lstm1"
	  type: "LSTM"
	  include {
	    stage: "unfactored"
	  }
	  bottom: "embedded_input_video_sequence"
	  # Second bottom of the recurrent layer (sequence continuation markers).
	  bottom: "cont_sentence"
	  top: "lstm1"
	  recurrent_param {
	    num_output: 1000
	    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
	    bias_filler { type: "constant" value: 0 }
	  }
	}
	# Unfactored model, second LSTM stacked on lstm1. The single include rule
	# lists two stages, so the state must carry both "unfactored" and "2-layer".
	layer {
	  name: "lstm2"
	  type: "LSTM"
	  include { stage: "unfactored" stage: "2-layer" }
	  bottom: "lstm1"
	  # Second bottom of the recurrent layer (sequence continuation markers).
	  bottom: "cont_sentence"
	  top: "lstm2"
	  recurrent_param {
	    num_output: 1000
	    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
	    bias_filler { type: "constant" value: 0 }
	  }
	}
	# Factored model, first LSTM: sees only the embedded frame features.
	layer {
	  name: "lstm1"
	  type: "LSTM"
	  include {
	    stage: "factored"
	  }
	  bottom: "embedded_input_frames"
	  # Second bottom of the recurrent layer (sequence continuation markers).
	  bottom: "cont_sentence"
	  top: "lstm1"
	  recurrent_param {
	    num_output: 1000
	    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
	    bias_filler { type: "constant" value: 0 }
	  }
	}
	# Optional in-place dropout on the lstm1 output; the layer is only part of
	# the net when the state includes stage "dropLstm1".
	layer {
	  name: "drop_lstm1"
	  type: "Dropout"
	  include {
	    stage: "dropLstm1"
	  }
	  dropout_param {
	    dropout_ratio: 0.5
	  }
	  bottom: "lstm1"
	  top: "lstm1"
	}
	# Factored model: concatenate the lstm1 output, the word embeddings and the
	# stage indicator along axis 2 to form the input of the second LSTM.
	layer {
	  name: "concat"
	  type: "Concat"
	  bottom: "lstm1"
	  bottom: "embedded_input_sentence"
	  bottom: "stage_indicator_3axis"
	  top: "lstm1_video_sequence"
	  # "axis" replaces the deprecated "concat_dim" alias (same value, same result).
	  concat_param { axis: 2 }
	  include { stage: "factored" }
	}
	# Factored model, second LSTM: consumes lstm1 output joined with the word
	# embeddings and the stage indicator.
	layer {
	  name: "lstm2"
	  type: "LSTM"
	  include {
	    stage: "factored"
	  }
	  bottom: "lstm1_video_sequence"
	  # Second bottom of the recurrent layer (sequence continuation markers).
	  bottom: "cont_sentence"
	  top: "lstm2"
	  recurrent_param {
	    num_output: 1000
	    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
	    bias_filler { type: "constant" value: 0 }
	  }
	}
	# Optional in-place dropout on the lstm2 output; the layer is only part of
	# the net when the state includes stage "dropLstm2".
	layer {
	  name: "drop_lstm2"
	  type: "Dropout"
	  include {
	    stage: "dropLstm2"
	  }
	  dropout_param {
	    dropout_ratio: 0.5
	  }
	  bottom: "lstm2"
	  top: "lstm2"
	}
	# Vocabulary scores for single-LSTM variants: reads lstm1 and is dropped from
	# the net whenever the state includes stage "2-layer".
	layer {
	  name: "predict"
	  type: "InnerProduct"
	  exclude {
	    stage: "2-layer"
	  }
	  bottom: "lstm1"
	  top: "predict"
	  inner_product_param {
	    axis: 2
	    num_output: 46168  # youtube_movie_vocab + 1
	    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
	    bias_filler { type: "constant" value: 0 }
	  }
	  # First param block (weights): base learning rate, weight decay applied.
	  param { lr_mult: 1 decay_mult: 1 }
	  # Second param block (bias): doubled learning rate, no weight decay.
	  param { lr_mult: 2 decay_mult: 0 }
	}
	# Vocabulary scores for two-LSTM variants: reads lstm2 and is only part of
	# the net when the state includes stage "2-layer".
	layer {
	  name: "predict"
	  type: "InnerProduct"
	  include {
	    stage: "2-layer"
	  }
	  bottom: "lstm2"
	  top: "predict"
	  inner_product_param {
	    axis: 2
	    num_output: 46168  # youtube_movie_vocab + 1
	    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
	    bias_filler { type: "constant" value: 0 }
	  }
	  # First param block (weights): base learning rate, weight decay applied.
	  param { lr_mult: 1 decay_mult: 1 }
	  # Second param block (bias): doubled learning rate, no weight decay.
	  param { lr_mult: 2 decay_mult: 0 }
	}
	# Softmax cross-entropy between the predicted scores and the target words.
	layer {
	  name: "cross_entropy_loss"
	  type: "SoftmaxWithLoss"
	  bottom: "predict"
	  bottom: "target_sentence"
	  top: "cross_entropy_loss"
	  softmax_param { axis: 2 }        # softmax over axis 2 of "predict"
	  loss_param { ignore_label: -1 }  # targets equal to -1 do not contribute
	  loss_weight: 20
	}
	# Prediction accuracy, reported in the TEST phase only.
	layer {
	  name: "accuracy"
	  type: "Accuracy"
	  include {
	    phase: TEST
	  }
	  bottom: "predict"
	  bottom: "target_sentence"
	  top: "accuracy"
	  accuracy_param {
	    ignore_label: -1  # targets equal to -1 are skipped
	    axis: 2
	  }
	}
	# Solver configuration for the S2VT network definition.
	net: "./s2vt.prototxt"

	# s2vt.prototxt supports several sequence-to-sequence architectures,
	# selected through the state stages below:
	#   (1) stage: 'factored'   stage: '2-layer'
	#   (2) stage: 'unfactored' stage: '1-layer'
	#   (3) stage: 'unfactored' stage: '2-layer'
	# Optional add-on stages:
	#   (a) 'dropFc7'   [input frame feature dropout]
	#   (b) 'dropEn'    [text feature dropout after embedding]
	#   (c) 'dropLstm1' [dropout on output of lstm1]
	#   (d) 'dropLstm2' [dropout on output of lstm2]
	#
	# This solver uses variant (1), which performed best on the youtube dataset.
	# To use a different variant, change train_state and both test_state
	# entries together.
	train_state {
	  stage: "factored"
	  stage: "2-layer"
	}

	# Two test nets; each test_iter pairs with the test_state that follows it.
	test_iter: 25
	test_state {
	  stage: "factored"
	  stage: "2-layer"
	  stage: "test-on-train"
	}
	test_iter: 25
	test_state {
	  stage: "factored"
	  stage: "2-layer"
	  stage: "test-on-val"
	}
	test_interval: 1000

	# Optimisation.
	base_lr: 0.01
	momentum: 0.9
	weight_decay: 0.0000
	clip_gradients: 10

	# Learning-rate schedule.
	# NOTE(review): stepsize (20000) exceeds max_iter (18000), so the "step"
	# decay by gamma never fires and the rate stays at base_lr -- confirm intended.
	lr_policy: "step"
	gamma: 0.5
	stepsize: 20000
	max_iter: 18000

	# Logging and snapshots.
	display: 1
	average_loss: 100
	snapshot: 1000
	snapshot_prefix: "./snapshots/s2vt_youtube_vgg"

	# Execution.
	solver_mode: GPU
	random_seed: 1701