moskewcz · August 29, 2015 14:17
diff --git a/alexnet_ng_conv.sgemm.py b/alexnet_ng_conv.sgemm.py
 # conv1 stage: the 11x11, stride-4 convolution is lowered to a per-image patch copy
 # (patches_to_rows) followed by one matrix multiply, then ReLU, LRN and pooling.
 # Scratch-buffer shape 3025x363: 3025 = 55*55 output positions, 363 = 3*11*11 values per patch.
 # NOTE(review): in the "MxNxK" annotation below the middle number (363) is the shared inner
 # dimension of the product; BLAS sgemm naming would call this M=3025, N=96, K=363.
 # NOTE(review): the loop assigns to `conv1` as a whole (no image-i slice) and conv1_biases is
 # never referenced in this listing -- presumably pseudo-code simplifications; confirm.
 data = NDA("data",num_img,3,227,227) # SOURCE num,chan,y,x
 conv1_filts = NDA("conv1_filts",96,3,11,11) # SOURCE out_chan,in_chan,y,x
 conv1_biases = NDA("conv1_biases",96) # SOURCE out_chan
 conv1 = NDA("conv1",num_img,96,55,55) # num,chan,y,x
 data_one_row_per_patch_buf = NDA("data_one_row_per_patch_buf",3025,363)
 for i in range(0,num_img):
  patches_to_rows( src=data[i,:,:,:], dest=data_one_row_per_patch_buf, in_pad="0 0 0 0",stride="4 4" ) # one copy per output elem
  conv1 = data_one_row_per_patch_buf * transpose(reshape(conv1_filts,96,363)) # sgemm: MxNxK == 3025x363x96
 ReLU(name="relu1",in_place=[conv1])
 norm1 = NDA("norm1",num_img,96,55,55) # num,chan,y,x
 LRN(name="norm1",bots=[ conv1 ],tops=[ norm1 ],
 	in_pad="0 0 0 0",stride="1 1")
 # Pooling with stride 2 reduces 55x55 to 27x27 (the pooling window size is not shown in this listing).
 pool1 = NDA("pool1",num_img,96,27,27) # num,chan,y,x
 Pooling(name="pool1",bots=[ norm1 ],tops=[ pool1 ],
 	in_pad="0 0 0 0",stride="2 2")
 # conv2 stage: the 5x5 convolution (pad 2, stride 1) is lowered to a per-image patch copy plus
 # one matrix multiply, then ReLU, LRN and pooling.
 # Scratch-buffer shape 729x2400: 729 = 27*27 output positions, 2400 = 96*5*5 values per patch.
 # NOTE(review): in "MxNxK" the middle number (2400) is the shared inner dimension; the product is 729x256.
 # NOTE(review): `conv2` is assigned as a whole on every iteration and conv2_biases is unused here -- confirm.
 conv2_filts = NDA("conv2_filts",256,96,5,5) # SOURCE out_chan,in_chan,y,x
 conv2_biases = NDA("conv2_biases",256) # SOURCE out_chan
 conv2 = NDA("conv2",num_img,256,27,27) # num,chan,y,x
 pool1_one_row_per_patch_buf = NDA("pool1_one_row_per_patch_buf",729,2400)
 for i in range(0,num_img):
  patches_to_rows( src=pool1[i,:,:,:], dest=pool1_one_row_per_patch_buf, in_pad="2 2 2 2",stride="1 1" ) # one copy per output elem
  conv2 = pool1_one_row_per_patch_buf * transpose(reshape(conv2_filts,256,2400)) # sgemm: MxNxK == 729x2400x256
 ReLU(name="relu2",in_place=[conv2])
 norm2 = NDA("norm2",num_img,256,27,27) # num,chan,y,x
 LRN(name="norm2",bots=[ conv2 ],tops=[ norm2 ],
 	in_pad="0 0 0 0",stride="1 1")
 # Pooling with stride 2 reduces 27x27 to 13x13.
 pool2 = NDA("pool2",num_img,256,13,13) # num,chan,y,x
 Pooling(name="pool2",bots=[ norm2 ],tops=[ pool2 ],
 	in_pad="0 0 0 0",stride="2 2")
 # conv3: 3x3 convolution (pad 1, stride 1) lowered to a per-image patch copy plus one matrix multiply, then ReLU.
 # Scratch-buffer shape 169x2304: 169 = 13*13 output positions, 2304 = 256*3*3 values per patch.
 # NOTE(review): in "MxNxK" the middle number (2304) is the shared inner dimension; the product is 169x384.
 # NOTE(review): `conv3` is assigned as a whole on every iteration and conv3_biases is unused here -- confirm.
 conv3_filts = NDA("conv3_filts",384,256,3,3) # SOURCE out_chan,in_chan,y,x
 conv3_biases = NDA("conv3_biases",384) # SOURCE out_chan
 conv3 = NDA("conv3",num_img,384,13,13) # num,chan,y,x
 pool2_one_row_per_patch_buf = NDA("pool2_one_row_per_patch_buf",169,2304)
 for i in range(0,num_img):
  patches_to_rows( src=pool2[i,:,:,:], dest=pool2_one_row_per_patch_buf, in_pad="1 1 1 1",stride="1 1" ) # one copy per output elem
  conv3 = pool2_one_row_per_patch_buf * transpose(reshape(conv3_filts,384,2304)) # sgemm: MxNxK == 169x2304x384
 ReLU(name="relu3",in_place=[conv3])
 # conv4: 3x3 convolution (pad 1, stride 1) lowered to a per-image patch copy plus one matrix multiply, then ReLU.
 # Scratch-buffer shape 169x3456: 169 = 13*13 output positions, 3456 = 384*3*3 values per patch.
 # NOTE(review): in "MxNxK" the middle number (3456) is the shared inner dimension; the product is 169x384.
 # NOTE(review): `conv4` is assigned as a whole on every iteration and conv4_biases is unused here -- confirm.
 conv4_filts = NDA("conv4_filts",384,384,3,3) # SOURCE out_chan,in_chan,y,x
 conv4_biases = NDA("conv4_biases",384) # SOURCE out_chan
 conv4 = NDA("conv4",num_img,384,13,13) # num,chan,y,x
 conv3_one_row_per_patch_buf = NDA("conv3_one_row_per_patch_buf",169,3456)
 for i in range(0,num_img):
  patches_to_rows( src=conv3[i,:,:,:], dest=conv3_one_row_per_patch_buf, in_pad="1 1 1 1",stride="1 1" ) # one copy per output elem
  conv4 = conv3_one_row_per_patch_buf * transpose(reshape(conv4_filts,384,3456)) # sgemm: MxNxK == 169x3456x384
 ReLU(name="relu4",in_place=[conv4])
 # conv5 stage: 3x3 convolution (pad 1, stride 1) lowered to a per-image patch copy plus one
 # matrix multiply, then ReLU and pooling.
 # Scratch-buffer shape 169x3456: 169 = 13*13 output positions, 3456 = 384*3*3 values per patch.
 # NOTE(review): in "MxNxK" the middle number (3456) is the shared inner dimension; the product is 169x256.
 # NOTE(review): `conv5` is assigned as a whole on every iteration and conv5_biases is unused here -- confirm.
 conv5_filts = NDA("conv5_filts",256,384,3,3) # SOURCE out_chan,in_chan,y,x
 conv5_biases = NDA("conv5_biases",256) # SOURCE out_chan
 conv5 = NDA("conv5",num_img,256,13,13) # num,chan,y,x
 conv4_one_row_per_patch_buf = NDA("conv4_one_row_per_patch_buf",169,3456)
 for i in range(0,num_img):
  patches_to_rows( src=conv4[i,:,:,:], dest=conv4_one_row_per_patch_buf, in_pad="1 1 1 1",stride="1 1" ) # one copy per output elem
  conv5 = conv4_one_row_per_patch_buf * transpose(reshape(conv5_filts,256,3456)) # sgemm: MxNxK == 169x3456x256
 ReLU(name="relu5",in_place=[conv5])
 # Pooling with stride 2 reduces 13x13 to 6x6.
 pool5 = NDA("pool5",num_img,256,6,6) # num,chan,y,x
 Pooling(name="pool5",bots=[ conv5 ],tops=[ pool5 ],
 	in_pad="0 0 0 0",stride="2 2")
 # fc6: fully-connected layer written as a 6x6 convolution over the 6x6 pool5 map, so each image
 # yields exactly one patch. Scratch-buffer shape 1x9216: 9216 = 256*6*6 values in that patch.
 # NOTE(review): in "MxNxK" the middle number (9216) is the shared inner dimension; with M == 1
 # each product is a matrix-vector multiply (result 1x4096).
 # NOTE(review): `fc6` is assigned as a whole on every iteration and fc6_conv_biases is unused here -- confirm.
 fc6_conv_filts = NDA("fc6_conv_filts",4096,256,6,6) # SOURCE out_chan,in_chan,y,x
 fc6_conv_biases = NDA("fc6_conv_biases",4096) # SOURCE out_chan
 fc6 = NDA("fc6",num_img,4096,1,1) # num,chan,y,x
 pool5_one_row_per_patch_buf = NDA("pool5_one_row_per_patch_buf",1,9216)
 for i in range(0,num_img):
  patches_to_rows( src=pool5[i,:,:,:], dest=pool5_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  fc6 = pool5_one_row_per_patch_buf * transpose(reshape(fc6_conv_filts,4096,9216)) # sgemm: MxNxK == 1x9216x4096
 ReLU(name="relu6",in_place=[fc6])
 Dropout(name="drop6",in_place=[fc6])
 # fc7: fully-connected layer written as a 1x1 convolution over the 1x1 fc6 map; one patch of
 # 4096 values per image (scratch-buffer shape 1x4096).
 # NOTE(review): in "MxNxK" the middle number (4096) is the shared inner dimension; with M == 1
 # each product is a matrix-vector multiply (result 1x4096).
 # NOTE(review): `fc7` is assigned as a whole on every iteration and fc7_conv_biases is unused here -- confirm.
 fc7_conv_filts = NDA("fc7_conv_filts",4096,4096,1,1) # SOURCE out_chan,in_chan,y,x
 fc7_conv_biases = NDA("fc7_conv_biases",4096) # SOURCE out_chan
 fc7 = NDA("fc7",num_img,4096,1,1) # num,chan,y,x
 fc6_one_row_per_patch_buf = NDA("fc6_one_row_per_patch_buf",1,4096)
 for i in range(0,num_img):
  patches_to_rows( src=fc6[i,:,:,:], dest=fc6_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  fc7 = fc6_one_row_per_patch_buf * transpose(reshape(fc7_conv_filts,4096,4096)) # sgemm: MxNxK == 1x4096x4096
 ReLU(name="relu7",in_place=[fc7])
 Dropout(name="drop7",in_place=[fc7])
 # fc8: final layer (the only SINK in this listing), written as a 1x1 convolution over the 1x1
 # fc7 map; one patch of 4096 values per image (scratch-buffer shape 1x4096), 1000 outputs.
 # NOTE(review): in "MxNxK" the middle number (4096) is the shared inner dimension; with M == 1
 # each product is a matrix-vector multiply (result 1x1000).
 # NOTE(review): `fc8` is assigned as a whole on every iteration and fc8_conv_biases is unused here -- confirm.
 fc8_conv_filts = NDA("fc8_conv_filts",1000,4096,1,1) # SOURCE out_chan,in_chan,y,x
 fc8_conv_biases = NDA("fc8_conv_biases",1000) # SOURCE out_chan
 fc8 = NDA("fc8",num_img,1000,1,1) # SINK num,chan,y,x
 fc7_one_row_per_patch_buf = NDA("fc7_one_row_per_patch_buf",1,4096)
 for i in range(0,num_img):
  patches_to_rows( src=fc7[i,:,:,:], dest=fc7_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  fc8 = fc7_one_row_per_patch_buf * transpose(reshape(fc8_conv_filts,1000,4096)) # sgemm: MxNxK == 1x4096x1000
diff --git a/alexnet_ng_conv.txt b/alexnet_ng_conv.txt
 # AlexNet as a flat list of NDA (n-d array) declarations and one op call per layer.
 # Arrays tagged SOURCE are inputs (image data, filters, biases); untagged arrays are intermediates.
 # Stage 1: conv1 (96 filters, 11x11, stride 4, no padding) 227 -> 55, ReLU in place, LRN,
 # pooling with stride 2 -> 27.
 data = NDA("data",num_img,3,227,227) # SOURCE num,chan,y,x
 conv1_filts = NDA("conv1_filts",96,3,11,11) # SOURCE out_chan,in_chan,y,x
 conv1_biases = NDA("conv1_biases",96) # SOURCE out_chan
 conv1 = NDA("conv1",num_img,96,55,55) # num,chan,y,x
 Convolution(name="conv1",bots=[ data ],tops=[ conv1 ],filts=conv1_filts,biases=conv1_biases,
 	in_pad="0 0 0 0",stride="4 4")
 ReLU(name="relu1",in_place=[conv1])
 norm1 = NDA("norm1",num_img,96,55,55) # num,chan,y,x
 LRN(name="norm1",bots=[ conv1 ],tops=[ norm1 ],
 	in_pad="0 0 0 0",stride="1 1")
 pool1 = NDA("pool1",num_img,96,27,27) # num,chan,y,x
 Pooling(name="pool1",bots=[ norm1 ],tops=[ pool1 ],
 	in_pad="0 0 0 0",stride="2 2")
 # Stage 2: conv2 (256 filters, 5x5, pad 2, stride 1) keeps 27x27; its filters span all 96
 # pool1 channels (no channel grouping). Then ReLU, LRN and pooling with stride 2 -> 13.
 conv2_filts = NDA("conv2_filts",256,96,5,5) # SOURCE out_chan,in_chan,y,x
 conv2_biases = NDA("conv2_biases",256) # SOURCE out_chan
 conv2 = NDA("conv2",num_img,256,27,27) # num,chan,y,x
 Convolution(name="conv2",bots=[ pool1 ],tops=[ conv2 ],filts=conv2_filts,biases=conv2_biases,
 	in_pad="2 2 2 2",stride="1 1")
 ReLU(name="relu2",in_place=[conv2])
 norm2 = NDA("norm2",num_img,256,27,27) # num,chan,y,x
 LRN(name="norm2",bots=[ conv2 ],tops=[ norm2 ],
 	in_pad="0 0 0 0",stride="1 1")
 pool2 = NDA("pool2",num_img,256,13,13) # num,chan,y,x
 Pooling(name="pool2",bots=[ norm2 ],tops=[ pool2 ],
 	in_pad="0 0 0 0",stride="2 2")
 # conv3..conv5: three 3x3 convolutions (pad 1, stride 1), each followed by an in-place ReLU;
 # all keep the 13x13 spatial size. Channel counts: 256 -> 384 -> 384 -> 256.
 conv3_filts = NDA("conv3_filts",384,256,3,3) # SOURCE out_chan,in_chan,y,x
 conv3_biases = NDA("conv3_biases",384) # SOURCE out_chan
 conv3 = NDA("conv3",num_img,384,13,13) # num,chan,y,x
 Convolution(name="conv3",bots=[ pool2 ],tops=[ conv3 ],filts=conv3_filts,biases=conv3_biases,
 	in_pad="1 1 1 1",stride="1 1")
 ReLU(name="relu3",in_place=[conv3])
 conv4_filts = NDA("conv4_filts",384,384,3,3) # SOURCE out_chan,in_chan,y,x
 conv4_biases = NDA("conv4_biases",384) # SOURCE out_chan
 conv4 = NDA("conv4",num_img,384,13,13) # num,chan,y,x
 Convolution(name="conv4",bots=[ conv3 ],tops=[ conv4 ],filts=conv4_filts,biases=conv4_biases,
 	in_pad="1 1 1 1",stride="1 1")
 ReLU(name="relu4",in_place=[conv4])
 conv5_filts = NDA("conv5_filts",256,384,3,3) # SOURCE out_chan,in_chan,y,x
 conv5_biases = NDA("conv5_biases",256) # SOURCE out_chan
 conv5 = NDA("conv5",num_img,256,13,13) # num,chan,y,x
 Convolution(name="conv5",bots=[ conv4 ],tops=[ conv5 ],filts=conv5_filts,biases=conv5_biases,
 	in_pad="1 1 1 1",stride="1 1")
 ReLU(name="relu5",in_place=[conv5])
 # Pooling with stride 2 reduces 13x13 to 6x6.
 pool5 = NDA("pool5",num_img,256,6,6) # num,chan,y,x
 Pooling(name="pool5",bots=[ conv5 ],tops=[ pool5 ],
 	in_pad="0 0 0 0",stride="2 2")
 # fc6..fc8: the fully-connected layers written as Convolution ops.
 # fc6 uses 6x6 filters over the 6x6 pool5 map (output 1x1); fc7 and fc8 use 1x1 filters.
 # fc6 and fc7 are followed by in-place ReLU and Dropout; fc8 (1000 channels) is the SINK.
 fc6_conv_filts = NDA("fc6_conv_filts",4096,256,6,6) # SOURCE out_chan,in_chan,y,x
 fc6_conv_biases = NDA("fc6_conv_biases",4096) # SOURCE out_chan
 fc6 = NDA("fc6",num_img,4096,1,1) # num,chan,y,x
 Convolution(name="fc6_conv",bots=[ pool5 ],tops=[ fc6 ],filts=fc6_conv_filts,biases=fc6_conv_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu6",in_place=[fc6])
 Dropout(name="drop6",in_place=[fc6])
 fc7_conv_filts = NDA("fc7_conv_filts",4096,4096,1,1) # SOURCE out_chan,in_chan,y,x
 fc7_conv_biases = NDA("fc7_conv_biases",4096) # SOURCE out_chan
 fc7 = NDA("fc7",num_img,4096,1,1) # num,chan,y,x
 Convolution(name="fc7_conv",bots=[ fc6 ],tops=[ fc7 ],filts=fc7_conv_filts,biases=fc7_conv_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu7",in_place=[fc7])
 Dropout(name="drop7",in_place=[fc7])
 fc8_conv_filts = NDA("fc8_conv_filts",1000,4096,1,1) # SOURCE out_chan,in_chan,y,x
 fc8_conv_biases = NDA("fc8_conv_biases",1000) # SOURCE out_chan
 fc8 = NDA("fc8",num_img,1000,1,1) # SINK num,chan,y,x
 Convolution(name="fc8_conv",bots=[ fc7 ],tops=[ fc8 ],filts=fc8_conv_filts,biases=fc8_conv_biases,
 	in_pad="0 0 0 0",stride="1 1")
diff --git a/alexnet_ng_conv_train_val.prototxt b/alexnet_ng_conv_train_val.prototxt
 # Caffe train/val net definition for AlexNet.
 name: "AlexNet"
 # TRAIN-phase input: LMDB-backed Data layer producing "data" and "label", batch size 256,
 # mirroring enabled, crop_size 227, mean_file set.
 # NOTE(review): source and mean_file are absolute, machine-specific /scratch paths.
 layer {
  name: "data"
  type: "Data"
  top: "data"
  top: "label"
  include {
    phase: TRAIN
  }
  transform_param {
    mirror: true
    crop_size: 227
    mean_file: "/scratch/datasets/data/ilsvrc12/imagenet_mean.binaryproto"
  }
  data_param {
    source: "/scratch/datasets/imagenet_classification/ilsvrc12_train_lmdb"
    batch_size: 256
    backend: LMDB
  }
 }
 # TEST-phase input: same tops ("data", "label") read from the validation LMDB, batch size 50,
 # mirroring disabled, crop_size 227, mean_file set.
 # NOTE(review): source and mean_file are absolute, machine-specific /scratch paths.
 layer {
  name: "data"
  type: "Data"
  top: "data"
  top: "label"
  include {
    phase: TEST
  }
  transform_param {
    mirror: false
    crop_size: 227
    mean_file: "/scratch/datasets/data/ilsvrc12/imagenet_mean.binaryproto"
  }
  data_param {
    source: "/scratch/datasets/imagenet_classification/ilsvrc12_val_lmdb"
    batch_size: 50
    backend: LMDB
  }
 }
 # conv1: 96 filters, 11x11, stride 4, gaussian weight init (std 0.01), constant bias init 0.
 # The two param blocks presumably apply to weights (lr_mult 1, decay_mult 1) and biases
 # (lr_mult 2, decay_mult 0) in that order, per Caffe convention -- confirm.
 layer {
  name: "conv1"
  type: "Convolution"
  bottom: "data"
  top: "conv1"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  convolution_param {
    num_output: 96
    kernel_size: 11
    stride: 4
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 # relu1: bottom and top are both "conv1", i.e. the activation is applied in place.
 layer {
  name: "relu1"
  type: "ReLU"
  bottom: "conv1"
  top: "conv1"
 }
 # norm1: local response normalization (local_size 5, alpha 0.0001, beta 0.75).
 layer {
  name: "norm1"
  type: "LRN"
  bottom: "conv1"
  top: "norm1"
  lrn_param {
    local_size: 5
    alpha: 0.0001
    beta: 0.75
  }
 }
 # pool1: 3x3 max pooling, stride 2.
 layer {
  name: "pool1"
  type: "Pooling"
  bottom: "norm1"
  top: "pool1"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
 }
 # conv2: 256 filters, 5x5, pad 2, gaussian weight init (std 0.01), constant bias init 0.1.
 # No stride field is given, so Caffe's default applies; no group field is given either.
 # The two param blocks presumably apply to weights then biases, per Caffe convention -- confirm.
 layer {
  name: "conv2"
  type: "Convolution"
  bottom: "pool1"
  top: "conv2"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  convolution_param {
    num_output: 256
    pad: 2
    kernel_size: 5
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0.1
    }
  }
 }
 # relu2: applied in place on "conv2".
 layer {
  name: "relu2"
  type: "ReLU"
  bottom: "conv2"
  top: "conv2"
 }
 # norm2: local response normalization (local_size 5, alpha 0.0001, beta 0.75).
 layer {
  name: "norm2"
  type: "LRN"
  bottom: "conv2"
  top: "norm2"
  lrn_param {
    local_size: 5
    alpha: 0.0001
    beta: 0.75
  }
 }
 # pool2: 3x3 max pooling, stride 2.
 layer {
  name: "pool2"
  type: "Pooling"
  bottom: "norm2"
  top: "pool2"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
 }
 # conv3: 384 filters, 3x3, pad 1, gaussian weight init (std 0.01), constant bias init 0.
 # The two param blocks presumably apply to weights then biases, per Caffe convention -- confirm.
 layer {
  name: "conv3"
  type: "Convolution"
  bottom: "pool2"
  top: "conv3"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  convolution_param {
    num_output: 384
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 # relu3: applied in place on "conv3".
 layer {
  name: "relu3"
  type: "ReLU"
  bottom: "conv3"
  top: "conv3"
 }
 # conv4: 384 filters, 3x3, pad 1, gaussian weight init (std 0.01), constant bias init 0.1.
 # No group field is given. The two param blocks presumably apply to weights then biases -- confirm.
 layer {
  name: "conv4"
  type: "Convolution"
  bottom: "conv3"
  top: "conv4"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  convolution_param {
    num_output: 384
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0.1
    }
  }
 }
 # relu4: applied in place on "conv4".
 layer {
  name: "relu4"
  type: "ReLU"
  bottom: "conv4"
  top: "conv4"
 }
 # conv5: 256 filters, 3x3, pad 1, gaussian weight init (std 0.01), constant bias init 0.1.
 # No group field is given. The two param blocks presumably apply to weights then biases -- confirm.
 layer {
  name: "conv5"
  type: "Convolution"
  bottom: "conv4"
  top: "conv5"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0.1
    }
  }
 }
 # relu5: applied in place on "conv5".
 layer {
  name: "relu5"
  type: "ReLU"
  bottom: "conv5"
  top: "conv5"
 }
 # pool5: 3x3 max pooling, stride 2.
 layer {
  name: "pool5"
  type: "Pooling"
  bottom: "conv5"
  top: "pool5"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
 }
 # fc6-conv: the fc6 layer written as a Convolution with 4096 outputs and a 6x6 kernel over pool5;
 # gaussian weight init (std 0.005), constant bias init 0.1.
 # NOTE(review): the layer name uses '-' ("fc6-conv") while its top blob is plain "fc6".
 layer {
  name: "fc6-conv"
  type: "Convolution"
  bottom: "pool5"
  top: "fc6"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  convolution_param {
    num_output: 4096
    kernel_size: 6
    weight_filler {
      type: "gaussian"
      std: 0.005
    }
    bias_filler {
      type: "constant"
      value: 0.1
    }
  }
 }
 # relu6: applied in place on "fc6".
 layer {
  name: "relu6"
  type: "ReLU"
  bottom: "fc6"
  top: "fc6"
 }
 # drop6: dropout with ratio 0.5, applied in place on "fc6".
 layer {
  name: "drop6"
  type: "Dropout"
  bottom: "fc6"
  top: "fc6"
  dropout_param {
    dropout_ratio: 0.5
  }
 }
 # fc7-conv: the fc7 layer written as a Convolution with 4096 outputs and a 1x1 kernel over fc6;
 # gaussian weight init (std 0.005), constant bias init 0.1.
 layer {
  name: "fc7-conv"
  type: "Convolution"
  bottom: "fc6"
  top: "fc7"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  convolution_param {
    num_output: 4096
    kernel_size: 1
    weight_filler {
      type: "gaussian"
      std: 0.005
    }
    bias_filler {
      type: "constant"
      value: 0.1
    }
  }
 }
 # relu7: applied in place on "fc7".
 layer {
  name: "relu7"
  type: "ReLU"
  bottom: "fc7"
  top: "fc7"
 }
 # drop7: dropout with ratio 0.5, applied in place on "fc7".
 layer {
  name: "drop7"
  type: "Dropout"
  bottom: "fc7"
  top: "fc7"
  dropout_param {
    dropout_ratio: 0.5
  }
 }
 # fc8-conv: the final layer written as a Convolution with 1000 outputs and a 1x1 kernel over fc7;
 # gaussian weight init (std 0.01), constant bias init 0. Its top "fc8" feeds accuracy and loss.
 layer {
  name: "fc8-conv"
  type: "Convolution"
  bottom: "fc7"
  top: "fc8"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  convolution_param {
    num_output: 1000
    kernel_size: 1
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 # accuracy: compares "fc8" with "label"; included in the TEST phase only.
 layer {
  name: "accuracy"
  type: "Accuracy"
  bottom: "fc8"
  bottom: "label"
  top: "accuracy"
  include {
    phase: TEST
  }
 }
 # loss: softmax loss over "fc8" and "label"; it has no include block, so it is not phase-restricted.
 layer {
  name: "loss"
  type: "SoftmaxWithLoss"
  bottom: "fc8"
  bottom: "label"
  top: "loss"
 }
diff --git a/CDNN_OPS_LIST.md b/CDNN_OPS_LIST.md
diff --git a/googlenet.txt b/googlenet.txt
 # GoogLeNet stem, as NDA declarations plus one op call per layer:
 # conv1 (64 filters, 7x7, pad 3, stride 2) 224 -> 112, ReLU, pooling (stride 2) -> 56, LRN,
 # 1x1 "reduction2" conv (64 ch), 3x3 conv2 (pad 1, 192 ch), LRN, pooling (stride 2) -> 28.
 # Pooling window sizes are not shown in this listing.
 data = NDA("data",num_img,3,224,224) # SOURCE num,chan,y,x
 conv1_filts = NDA("conv1_filts",64,3,7,7) # SOURCE out_chan,in_chan,y,x
 conv1_biases = NDA("conv1_biases",64) # SOURCE out_chan
 conv1 = NDA("conv1",num_img,64,112,112) # num,chan,y,x
 Convolution(name="conv1",bots=[ data ],tops=[ conv1 ],filts=conv1_filts,biases=conv1_biases,
 	in_pad="3 3 3 3",stride="2 2")
 ReLU(name="relu1",in_place=[conv1])
 pool1 = NDA("pool1",num_img,64,56,56) # num,chan,y,x
 Pooling(name="pool1",bots=[ conv1 ],tops=[ pool1 ],
 	in_pad="0 0 0 0",stride="2 2")
 norm1 = NDA("norm1",num_img,64,56,56) # num,chan,y,x
 LRN(name="norm1",bots=[ pool1 ],tops=[ norm1 ],
 	in_pad="0 0 0 0",stride="1 1")
 reduction2_filts = NDA("reduction2_filts",64,64,1,1) # SOURCE out_chan,in_chan,y,x
 reduction2_biases = NDA("reduction2_biases",64) # SOURCE out_chan
 reduction2 = NDA("reduction2",num_img,64,56,56) # num,chan,y,x
 Convolution(name="reduction2",bots=[ norm1 ],tops=[ reduction2 ],filts=reduction2_filts,biases=reduction2_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu_reduction2",in_place=[reduction2])
 conv2_filts = NDA("conv2_filts",192,64,3,3) # SOURCE out_chan,in_chan,y,x
 conv2_biases = NDA("conv2_biases",192) # SOURCE out_chan
 conv2 = NDA("conv2",num_img,192,56,56) # num,chan,y,x
 Convolution(name="conv2",bots=[ reduction2 ],tops=[ conv2 ],filts=conv2_filts,biases=conv2_biases,
 	in_pad="1 1 1 1",stride="1 1")
 ReLU(name="relu2",in_place=[conv2])
 norm2 = NDA("norm2",num_img,192,56,56) # num,chan,y,x
 LRN(name="norm2",bots=[ conv2 ],tops=[ norm2 ],
 	in_pad="0 0 0 0",stride="1 1")
 pool2 = NDA("pool2",num_img,192,28,28) # num,chan,y,x
 Pooling(name="pool2",bots=[ norm2 ],tops=[ pool2 ],
 	in_pad="0 0 0 0",stride="2 2")
 # Inception module icp1, fed by pool2 (192 ch, 28x28). Four parallel branches, each ending in ReLU:
 #   out1: 1x1 reduction (96)  -> 3x3 conv, pad 1 (128)
 #   out2: 1x1 reduction (16)  -> 5x5 conv, pad 2 (32)
 #   out3: pooling, pad 1, stride 1 -> 1x1 conv (32)
 #   out0: 1x1 conv (64)
 # Concat of out0..out3 gives icp2_in with 64+128+32+32 = 256 channels.
 icp1_reduction1_filts = NDA("icp1_reduction1_filts",96,192,1,1) # SOURCE out_chan,in_chan,y,x
 icp1_reduction1_biases = NDA("icp1_reduction1_biases",96) # SOURCE out_chan
 icp1_reduction1 = NDA("icp1_reduction1",num_img,96,28,28) # num,chan,y,x
 Convolution(name="icp1_reduction1",bots=[ pool2 ],tops=[ icp1_reduction1 ],filts=icp1_reduction1_filts,biases=icp1_reduction1_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu_icp1_reduction1",in_place=[icp1_reduction1])
 icp1_out1_filts = NDA("icp1_out1_filts",128,96,3,3) # SOURCE out_chan,in_chan,y,x
 icp1_out1_biases = NDA("icp1_out1_biases",128) # SOURCE out_chan
 icp1_out1 = NDA("icp1_out1",num_img,128,28,28) # num,chan,y,x
 Convolution(name="icp1_out1",bots=[ icp1_reduction1 ],tops=[ icp1_out1 ],filts=icp1_out1_filts,biases=icp1_out1_biases,
 	in_pad="1 1 1 1",stride="1 1")
 ReLU(name="relu_icp1_out1",in_place=[icp1_out1])
 icp1_reduction2_filts = NDA("icp1_reduction2_filts",16,192,1,1) # SOURCE out_chan,in_chan,y,x
 icp1_reduction2_biases = NDA("icp1_reduction2_biases",16) # SOURCE out_chan
 icp1_reduction2 = NDA("icp1_reduction2",num_img,16,28,28) # num,chan,y,x
 Convolution(name="icp1_reduction2",bots=[ pool2 ],tops=[ icp1_reduction2 ],filts=icp1_reduction2_filts,biases=icp1_reduction2_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu_icp1_reduction2",in_place=[icp1_reduction2])
 icp1_out2_filts = NDA("icp1_out2_filts",32,16,5,5) # SOURCE out_chan,in_chan,y,x
 icp1_out2_biases = NDA("icp1_out2_biases",32) # SOURCE out_chan
 icp1_out2 = NDA("icp1_out2",num_img,32,28,28) # num,chan,y,x
 Convolution(name="icp1_out2",bots=[ icp1_reduction2 ],tops=[ icp1_out2 ],filts=icp1_out2_filts,biases=icp1_out2_biases,
 	in_pad="2 2 2 2",stride="1 1")
 ReLU(name="relu_icp1_out2",in_place=[icp1_out2])
 icp1_pool = NDA("icp1_pool",num_img,192,28,28) # num,chan,y,x
 Pooling(name="icp1_pool",bots=[ pool2 ],tops=[ icp1_pool ],
 	in_pad="1 1 1 1",stride="1 1")
 icp1_out3_filts = NDA("icp1_out3_filts",32,192,1,1) # SOURCE out_chan,in_chan,y,x
 icp1_out3_biases = NDA("icp1_out3_biases",32) # SOURCE out_chan
 icp1_out3 = NDA("icp1_out3",num_img,32,28,28) # num,chan,y,x
 Convolution(name="icp1_out3",bots=[ icp1_pool ],tops=[ icp1_out3 ],filts=icp1_out3_filts,biases=icp1_out3_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu_icp1_out3",in_place=[icp1_out3])
 icp1_out0_filts = NDA("icp1_out0_filts",64,192,1,1) # SOURCE out_chan,in_chan,y,x
 icp1_out0_biases = NDA("icp1_out0_biases",64) # SOURCE out_chan
 icp1_out0 = NDA("icp1_out0",num_img,64,28,28) # num,chan,y,x
 Convolution(name="icp1_out0",bots=[ pool2 ],tops=[ icp1_out0 ],filts=icp1_out0_filts,biases=icp1_out0_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu_icp1_out0",in_place=[icp1_out0])
 icp2_in = NDA("icp2_in",num_img,256,28,28) # num,chan,y,x
 Concat(name="icp2_in",bots=[ icp1_out0, icp1_out1, icp1_out2, icp1_out3 ],tops=[ icp2_in ],
 	in_pad="0 0 0 0",stride="1 1")
 # Inception module icp2, fed by icp2_in (256 ch, 28x28). Four parallel branches, each ending in ReLU:
 #   out1: 1x1 reduction (128) -> 3x3 conv, pad 1 (192)
 #   out2: 1x1 reduction (32)  -> 5x5 conv, pad 2 (96)
 #   out3: pooling, pad 1, stride 1 -> 1x1 conv (64)
 #   out0: 1x1 conv (128)
 # Concat of out0..out3 gives icp2_out with 128+192+96+64 = 480 channels.
 icp2_reduction1_filts = NDA("icp2_reduction1_filts",128,256,1,1) # SOURCE out_chan,in_chan,y,x
 icp2_reduction1_biases = NDA("icp2_reduction1_biases",128) # SOURCE out_chan
 icp2_reduction1 = NDA("icp2_reduction1",num_img,128,28,28) # num,chan,y,x
 Convolution(name="icp2_reduction1",bots=[ icp2_in ],tops=[ icp2_reduction1 ],filts=icp2_reduction1_filts,biases=icp2_reduction1_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu_icp2_reduction1",in_place=[icp2_reduction1])
 icp2_out1_filts = NDA("icp2_out1_filts",192,128,3,3) # SOURCE out_chan,in_chan,y,x
 icp2_out1_biases = NDA("icp2_out1_biases",192) # SOURCE out_chan
 icp2_out1 = NDA("icp2_out1",num_img,192,28,28) # num,chan,y,x
 Convolution(name="icp2_out1",bots=[ icp2_reduction1 ],tops=[ icp2_out1 ],filts=icp2_out1_filts,biases=icp2_out1_biases,
 	in_pad="1 1 1 1",stride="1 1")
 ReLU(name="relu_icp2_out1",in_place=[icp2_out1])
 icp2_reduction2_filts = NDA("icp2_reduction2_filts",32,256,1,1) # SOURCE out_chan,in_chan,y,x
 icp2_reduction2_biases = NDA("icp2_reduction2_biases",32) # SOURCE out_chan
 icp2_reduction2 = NDA("icp2_reduction2",num_img,32,28,28) # num,chan,y,x
 Convolution(name="icp2_reduction2",bots=[ icp2_in ],tops=[ icp2_reduction2 ],filts=icp2_reduction2_filts,biases=icp2_reduction2_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu_icp2_reduction2",in_place=[icp2_reduction2])
 icp2_out2_filts = NDA("icp2_out2_filts",96,32,5,5) # SOURCE out_chan,in_chan,y,x
 icp2_out2_biases = NDA("icp2_out2_biases",96) # SOURCE out_chan
 icp2_out2 = NDA("icp2_out2",num_img,96,28,28) # num,chan,y,x
 Convolution(name="icp2_out2",bots=[ icp2_reduction2 ],tops=[ icp2_out2 ],filts=icp2_out2_filts,biases=icp2_out2_biases,
 	in_pad="2 2 2 2",stride="1 1")
 ReLU(name="relu_icp2_out2",in_place=[icp2_out2])
 icp2_pool = NDA("icp2_pool",num_img,256,28,28) # num,chan,y,x
 Pooling(name="icp2_pool",bots=[ icp2_in ],tops=[ icp2_pool ],
 	in_pad="1 1 1 1",stride="1 1")
 icp2_out3_filts = NDA("icp2_out3_filts",64,256,1,1) # SOURCE out_chan,in_chan,y,x
 icp2_out3_biases = NDA("icp2_out3_biases",64) # SOURCE out_chan
 icp2_out3 = NDA("icp2_out3",num_img,64,28,28) # num,chan,y,x
 Convolution(name="icp2_out3",bots=[ icp2_pool ],tops=[ icp2_out3 ],filts=icp2_out3_filts,biases=icp2_out3_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu_icp2_out3",in_place=[icp2_out3])
 icp2_out0_filts = NDA("icp2_out0_filts",128,256,1,1) # SOURCE out_chan,in_chan,y,x
 icp2_out0_biases = NDA("icp2_out0_biases",128) # SOURCE out_chan
 icp2_out0 = NDA("icp2_out0",num_img,128,28,28) # num,chan,y,x
 Convolution(name="icp2_out0",bots=[ icp2_in ],tops=[ icp2_out0 ],filts=icp2_out0_filts,biases=icp2_out0_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu_icp2_out0",in_place=[icp2_out0])
 icp2_out = NDA("icp2_out",num_img,480,28,28) # num,chan,y,x
 Concat(name="icp2_out",bots=[ icp2_out0, icp2_out1, icp2_out2, icp2_out3 ],tops=[ icp2_out ],
 	in_pad="0 0 0 0",stride="1 1")
 # Down-sampling between modules: pooling with stride 2 takes icp2_out from 28x28 to 14x14 (icp3_in).
 icp3_in = NDA("icp3_in",num_img,480,14,14) # num,chan,y,x
 Pooling(name="icp3_in",bots=[ icp2_out ],tops=[ icp3_in ],
 	in_pad="0 0 0 0",stride="2 2")
 # Inception module icp3, fed by icp3_in (480 ch, 14x14). Four parallel branches, each ending in ReLU:
 #   out1: 1x1 reduction (96) -> 3x3 conv, pad 1 (208)
 #   out2: 1x1 reduction (16) -> 5x5 conv, pad 2 (48)
 #   out3: pooling, pad 1, stride 1 -> 1x1 conv (64)
 #   out0: 1x1 conv (192)
 # Concat of out0..out3 gives icp3_out with 192+208+48+64 = 512 channels.
 icp3_reduction1_filts = NDA("icp3_reduction1_filts",96,480,1,1) # SOURCE out_chan,in_chan,y,x
 icp3_reduction1_biases = NDA("icp3_reduction1_biases",96) # SOURCE out_chan
 icp3_reduction1 = NDA("icp3_reduction1",num_img,96,14,14) # num,chan,y,x
 Convolution(name="icp3_reduction1",bots=[ icp3_in ],tops=[ icp3_reduction1 ],filts=icp3_reduction1_filts,biases=icp3_reduction1_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu_icp3_reduction1",in_place=[icp3_reduction1])
 icp3_out1_filts = NDA("icp3_out1_filts",208,96,3,3) # SOURCE out_chan,in_chan,y,x
 icp3_out1_biases = NDA("icp3_out1_biases",208) # SOURCE out_chan
 icp3_out1 = NDA("icp3_out1",num_img,208,14,14) # num,chan,y,x
 Convolution(name="icp3_out1",bots=[ icp3_reduction1 ],tops=[ icp3_out1 ],filts=icp3_out1_filts,biases=icp3_out1_biases,
 	in_pad="1 1 1 1",stride="1 1")
 ReLU(name="relu_icp3_out1",in_place=[icp3_out1])
 icp3_reduction2_filts = NDA("icp3_reduction2_filts",16,480,1,1) # SOURCE out_chan,in_chan,y,x
 icp3_reduction2_biases = NDA("icp3_reduction2_biases",16) # SOURCE out_chan
 icp3_reduction2 = NDA("icp3_reduction2",num_img,16,14,14) # num,chan,y,x
 Convolution(name="icp3_reduction2",bots=[ icp3_in ],tops=[ icp3_reduction2 ],filts=icp3_reduction2_filts,biases=icp3_reduction2_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu_icp3_reduction2",in_place=[icp3_reduction2])
 icp3_out2_filts = NDA("icp3_out2_filts",48,16,5,5) # SOURCE out_chan,in_chan,y,x
 icp3_out2_biases = NDA("icp3_out2_biases",48) # SOURCE out_chan
 icp3_out2 = NDA("icp3_out2",num_img,48,14,14) # num,chan,y,x
 Convolution(name="icp3_out2",bots=[ icp3_reduction2 ],tops=[ icp3_out2 ],filts=icp3_out2_filts,biases=icp3_out2_biases,
 	in_pad="2 2 2 2",stride="1 1")
 ReLU(name="relu_icp3_out2",in_place=[icp3_out2])
 icp3_pool = NDA("icp3_pool",num_img,480,14,14) # num,chan,y,x
 Pooling(name="icp3_pool",bots=[ icp3_in ],tops=[ icp3_pool ],
 	in_pad="1 1 1 1",stride="1 1")
 icp3_out3_filts = NDA("icp3_out3_filts",64,480,1,1) # SOURCE out_chan,in_chan,y,x
 icp3_out3_biases = NDA("icp3_out3_biases",64) # SOURCE out_chan
 icp3_out3 = NDA("icp3_out3",num_img,64,14,14) # num,chan,y,x
 Convolution(name="icp3_out3",bots=[ icp3_pool ],tops=[ icp3_out3 ],filts=icp3_out3_filts,biases=icp3_out3_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu_icp3_out3",in_place=[icp3_out3])
 icp3_out0_filts = NDA("icp3_out0_filts",192,480,1,1) # SOURCE out_chan,in_chan,y,x
 icp3_out0_biases = NDA("icp3_out0_biases",192) # SOURCE out_chan
 icp3_out0 = NDA("icp3_out0",num_img,192,14,14) # num,chan,y,x
 Convolution(name="icp3_out0",bots=[ icp3_in ],tops=[ icp3_out0 ],filts=icp3_out0_filts,biases=icp3_out0_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu_icp3_out0",in_place=[icp3_out0])
 icp3_out = NDA("icp3_out",num_img,512,14,14) # num,chan,y,x
 Concat(name="icp3_out",bots=[ icp3_out0, icp3_out1, icp3_out2, icp3_out3 ],tops=[ icp3_out ],
 	in_pad="0 0 0 0",stride="1 1")
 # Classifier head cls1, branching off icp3_out (which also feeds icp4 below):
 # pooling with stride 3 (14x14 -> 4x4), 1x1 reduction conv to 128 ch + ReLU, InnerProduct to
 # 1024 (its 128x4x4 filters cover the whole 4x4 map) + ReLU + Dropout, InnerProduct to 33 outputs.
 # cls1_fc2_ is tagged SINK, i.e. it is an output of the net.
 # NOTE(review): 33 output channels differs from the usual 1000 ImageNet classes -- presumably
 # the class count of the dataset this net was defined for; confirm.
 # NOTE(review): the trailing '_' in "cls1_fc2_" (hence "cls1_fc2__filts") presumably comes from
 # the original layer name; confirm it is intentional.
 cls1_pool = NDA("cls1_pool",num_img,512,4,4) # num,chan,y,x
 Pooling(name="cls1_pool",bots=[ icp3_out ],tops=[ cls1_pool ],
 	in_pad="0 0 0 0",stride="3 3")
 cls1_reduction_filts = NDA("cls1_reduction_filts",128,512,1,1) # SOURCE out_chan,in_chan,y,x
 cls1_reduction_biases = NDA("cls1_reduction_biases",128) # SOURCE out_chan
 cls1_reduction = NDA("cls1_reduction",num_img,128,4,4) # num,chan,y,x
 Convolution(name="cls1_reduction",bots=[ cls1_pool ],tops=[ cls1_reduction ],filts=cls1_reduction_filts,biases=cls1_reduction_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu_cls1_reduction",in_place=[cls1_reduction])
 cls1_fc1_filts = NDA("cls1_fc1_filts",1024,128,4,4) # SOURCE out_chan,in_chan,y,x
 cls1_fc1_biases = NDA("cls1_fc1_biases",1024) # SOURCE out_chan
 cls1_fc1 = NDA("cls1_fc1",num_img,1024,1,1) # num,chan,y,x
 InnerProduct(name="cls1_fc1",bots=[ cls1_reduction ],tops=[ cls1_fc1 ],filts=cls1_fc1_filts,biases=cls1_fc1_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu_cls1_fc1",in_place=[cls1_fc1])
 Dropout(name="cls1_drop",in_place=[cls1_fc1])
 cls1_fc2__filts = NDA("cls1_fc2__filts",33,1024,1,1) # SOURCE out_chan,in_chan,y,x
 cls1_fc2__biases = NDA("cls1_fc2__biases",33) # SOURCE out_chan
 cls1_fc2_ = NDA("cls1_fc2_",num_img,33,1,1) # SINK num,chan,y,x
 InnerProduct(name="cls1_fc2_",bots=[ cls1_fc1 ],tops=[ cls1_fc2_ ],filts=cls1_fc2__filts,biases=cls1_fc2__biases,
 	in_pad="0 0 0 0",stride="1 1")
 # Inception module icp4, fed by icp3_out (512 ch, 14x14). Four parallel branches, each ending in ReLU:
 #   out1: 1x1 reduction (112) -> 3x3 conv, pad 1 (224)
 #   out2: 1x1 reduction (24)  -> 5x5 conv, pad 2 (64)
 #   out3: pooling, pad 1, stride 1 -> 1x1 conv (64)
 #   out0: 1x1 conv (160)
 # Concat of out0..out3 gives icp4_out with 160+224+64+64 = 512 channels.
 icp4_reduction1_filts = NDA("icp4_reduction1_filts",112,512,1,1) # SOURCE out_chan,in_chan,y,x
 icp4_reduction1_biases = NDA("icp4_reduction1_biases",112) # SOURCE out_chan
 icp4_reduction1 = NDA("icp4_reduction1",num_img,112,14,14) # num,chan,y,x
 Convolution(name="icp4_reduction1",bots=[ icp3_out ],tops=[ icp4_reduction1 ],filts=icp4_reduction1_filts,biases=icp4_reduction1_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu_icp4_reduction1",in_place=[icp4_reduction1])
 icp4_out1_filts = NDA("icp4_out1_filts",224,112,3,3) # SOURCE out_chan,in_chan,y,x
 icp4_out1_biases = NDA("icp4_out1_biases",224) # SOURCE out_chan
 icp4_out1 = NDA("icp4_out1",num_img,224,14,14) # num,chan,y,x
 Convolution(name="icp4_out1",bots=[ icp4_reduction1 ],tops=[ icp4_out1 ],filts=icp4_out1_filts,biases=icp4_out1_biases,
 	in_pad="1 1 1 1",stride="1 1")
 ReLU(name="relu_icp4_out1",in_place=[icp4_out1])
 icp4_reduction2_filts = NDA("icp4_reduction2_filts",24,512,1,1) # SOURCE out_chan,in_chan,y,x
 icp4_reduction2_biases = NDA("icp4_reduction2_biases",24) # SOURCE out_chan
 icp4_reduction2 = NDA("icp4_reduction2",num_img,24,14,14) # num,chan,y,x
 Convolution(name="icp4_reduction2",bots=[ icp3_out ],tops=[ icp4_reduction2 ],filts=icp4_reduction2_filts,biases=icp4_reduction2_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu_icp4_reduction2",in_place=[icp4_reduction2])
 icp4_out2_filts = NDA("icp4_out2_filts",64,24,5,5) # SOURCE out_chan,in_chan,y,x
 icp4_out2_biases = NDA("icp4_out2_biases",64) # SOURCE out_chan
 icp4_out2 = NDA("icp4_out2",num_img,64,14,14) # num,chan,y,x
 Convolution(name="icp4_out2",bots=[ icp4_reduction2 ],tops=[ icp4_out2 ],filts=icp4_out2_filts,biases=icp4_out2_biases,
 	in_pad="2 2 2 2",stride="1 1")
 ReLU(name="relu_icp4_out2",in_place=[icp4_out2])
 icp4_pool = NDA("icp4_pool",num_img,512,14,14) # num,chan,y,x
 Pooling(name="icp4_pool",bots=[ icp3_out ],tops=[ icp4_pool ],
 	in_pad="1 1 1 1",stride="1 1")
 icp4_out3_filts = NDA("icp4_out3_filts",64,512,1,1) # SOURCE out_chan,in_chan,y,x
 icp4_out3_biases = NDA("icp4_out3_biases",64) # SOURCE out_chan
 icp4_out3 = NDA("icp4_out3",num_img,64,14,14) # num,chan,y,x
 Convolution(name="icp4_out3",bots=[ icp4_pool ],tops=[ icp4_out3 ],filts=icp4_out3_filts,biases=icp4_out3_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu_icp4_out3",in_place=[icp4_out3])
 icp4_out0_filts = NDA("icp4_out0_filts",160,512,1,1) # SOURCE out_chan,in_chan,y,x
 icp4_out0_biases = NDA("icp4_out0_biases",160) # SOURCE out_chan
 icp4_out0 = NDA("icp4_out0",num_img,160,14,14) # num,chan,y,x
 Convolution(name="icp4_out0",bots=[ icp3_out ],tops=[ icp4_out0 ],filts=icp4_out0_filts,biases=icp4_out0_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu_icp4_out0",in_place=[icp4_out0])
 icp4_out = NDA("icp4_out",num_img,512,14,14) # num,chan,y,x
 Concat(name="icp4_out",bots=[ icp4_out0, icp4_out1, icp4_out2, icp4_out3 ],tops=[ icp4_out ],
 	in_pad="0 0 0 0",stride="1 1")
 # Inception module icp5, fed by icp4_out (512 ch, 14x14). Four parallel branches, each ending in ReLU:
 #   out1: 1x1 reduction (128) -> 3x3 conv, pad 1 (256)
 #   out2: 1x1 reduction (24)  -> 5x5 conv, pad 2 (64)
 #   out3: pooling, pad 1, stride 1 -> 1x1 conv (64)
 #   out0: 1x1 conv (128)
 # Concat of out0..out3 gives icp5_out with 128+256+64+64 = 512 channels.
 icp5_reduction1_filts = NDA("icp5_reduction1_filts",128,512,1,1) # SOURCE out_chan,in_chan,y,x
 icp5_reduction1_biases = NDA("icp5_reduction1_biases",128) # SOURCE out_chan
 icp5_reduction1 = NDA("icp5_reduction1",num_img,128,14,14) # num,chan,y,x
 Convolution(name="icp5_reduction1",bots=[ icp4_out ],tops=[ icp5_reduction1 ],filts=icp5_reduction1_filts,biases=icp5_reduction1_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu_icp5_reduction1",in_place=[icp5_reduction1])
 icp5_out1_filts = NDA("icp5_out1_filts",256,128,3,3) # SOURCE out_chan,in_chan,y,x
 icp5_out1_biases = NDA("icp5_out1_biases",256) # SOURCE out_chan
 icp5_out1 = NDA("icp5_out1",num_img,256,14,14) # num,chan,y,x
 Convolution(name="icp5_out1",bots=[ icp5_reduction1 ],tops=[ icp5_out1 ],filts=icp5_out1_filts,biases=icp5_out1_biases,
 	in_pad="1 1 1 1",stride="1 1")
 ReLU(name="relu_icp5_out1",in_place=[icp5_out1])
 icp5_reduction2_filts = NDA("icp5_reduction2_filts",24,512,1,1) # SOURCE out_chan,in_chan,y,x
 icp5_reduction2_biases = NDA("icp5_reduction2_biases",24) # SOURCE out_chan
 icp5_reduction2 = NDA("icp5_reduction2",num_img,24,14,14) # num,chan,y,x
 Convolution(name="icp5_reduction2",bots=[ icp4_out ],tops=[ icp5_reduction2 ],filts=icp5_reduction2_filts,biases=icp5_reduction2_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu_icp5_reduction2",in_place=[icp5_reduction2])
 icp5_out2_filts = NDA("icp5_out2_filts",64,24,5,5) # SOURCE out_chan,in_chan,y,x
 icp5_out2_biases = NDA("icp5_out2_biases",64) # SOURCE out_chan
 icp5_out2 = NDA("icp5_out2",num_img,64,14,14) # num,chan,y,x
 Convolution(name="icp5_out2",bots=[ icp5_reduction2 ],tops=[ icp5_out2 ],filts=icp5_out2_filts,biases=icp5_out2_biases,
 	in_pad="2 2 2 2",stride="1 1")
 ReLU(name="relu_icp5_out2",in_place=[icp5_out2])
 icp5_pool = NDA("icp5_pool",num_img,512,14,14) # num,chan,y,x
 Pooling(name="icp5_pool",bots=[ icp4_out ],tops=[ icp5_pool ],
 	in_pad="1 1 1 1",stride="1 1")
 icp5_out3_filts = NDA("icp5_out3_filts",64,512,1,1) # SOURCE out_chan,in_chan,y,x
 icp5_out3_biases = NDA("icp5_out3_biases",64) # SOURCE out_chan
 icp5_out3 = NDA("icp5_out3",num_img,64,14,14) # num,chan,y,x
 Convolution(name="icp5_out3",bots=[ icp5_pool ],tops=[ icp5_out3 ],filts=icp5_out3_filts,biases=icp5_out3_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu_icp5_out3",in_place=[icp5_out3])
 icp5_out0_filts = NDA("icp5_out0_filts",128,512,1,1) # SOURCE out_chan,in_chan,y,x
 icp5_out0_biases = NDA("icp5_out0_biases",128) # SOURCE out_chan
 icp5_out0 = NDA("icp5_out0",num_img,128,14,14) # num,chan,y,x
 Convolution(name="icp5_out0",bots=[ icp4_out ],tops=[ icp5_out0 ],filts=icp5_out0_filts,biases=icp5_out0_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu_icp5_out0",in_place=[icp5_out0])
 icp5_out = NDA("icp5_out",num_img,512,14,14) # num,chan,y,x
 Concat(name="icp5_out",bots=[ icp5_out0, icp5_out1, icp5_out2, icp5_out3 ],tops=[ icp5_out ],
 	in_pad="0 0 0 0",stride="1 1")
 # icp6 block: four branches computed from icp5_out (512x14x14), concatenated
 # into icp6_out at the end. Filter shapes are (out_chan, in_chan, y, x).
 #
 # Branch "out1", step 1: 1x1 convolution reducing 512 -> 144 channels, then in-place ReLU.
 icp6_reduction1_filts = NDA("icp6_reduction1_filts",144,512,1,1) # SOURCE out_chan,in_chan,y,x
 icp6_reduction1_biases = NDA("icp6_reduction1_biases",144) # SOURCE out_chan
 icp6_reduction1 = NDA("icp6_reduction1",num_img,144,14,14) # num,chan,y,x
 Convolution(name="icp6_reduction1",bots=[ icp5_out ],tops=[ icp6_reduction1 ],filts=icp6_reduction1_filts,biases=icp6_reduction1_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu_icp6_reduction1",in_place=[icp6_reduction1])
 # Branch "out1", step 2: 3x3 convolution 144 -> 288 channels (in_pad 1, stride 1).
 icp6_out1_filts = NDA("icp6_out1_filts",288,144,3,3) # SOURCE out_chan,in_chan,y,x
 icp6_out1_biases = NDA("icp6_out1_biases",288) # SOURCE out_chan
 icp6_out1 = NDA("icp6_out1",num_img,288,14,14) # num,chan,y,x
 Convolution(name="icp6_out1",bots=[ icp6_reduction1 ],tops=[ icp6_out1 ],filts=icp6_out1_filts,biases=icp6_out1_biases,
 	in_pad="1 1 1 1",stride="1 1")
 ReLU(name="relu_icp6_out1",in_place=[icp6_out1])
 # Branch "out2", step 1: 1x1 convolution reducing 512 -> 32 channels, then in-place ReLU.
 icp6_reduction2_filts = NDA("icp6_reduction2_filts",32,512,1,1) # SOURCE out_chan,in_chan,y,x
 icp6_reduction2_biases = NDA("icp6_reduction2_biases",32) # SOURCE out_chan
 icp6_reduction2 = NDA("icp6_reduction2",num_img,32,14,14) # num,chan,y,x
 Convolution(name="icp6_reduction2",bots=[ icp5_out ],tops=[ icp6_reduction2 ],filts=icp6_reduction2_filts,biases=icp6_reduction2_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu_icp6_reduction2",in_place=[icp6_reduction2])
 # Branch "out2", step 2: 5x5 convolution 32 -> 64 channels (in_pad 2, stride 1).
 icp6_out2_filts = NDA("icp6_out2_filts",64,32,5,5) # SOURCE out_chan,in_chan,y,x
 icp6_out2_biases = NDA("icp6_out2_biases",64) # SOURCE out_chan
 icp6_out2 = NDA("icp6_out2",num_img,64,14,14) # num,chan,y,x
 Convolution(name="icp6_out2",bots=[ icp6_reduction2 ],tops=[ icp6_out2 ],filts=icp6_out2_filts,biases=icp6_out2_biases,
 	in_pad="2 2 2 2",stride="1 1")
 ReLU(name="relu_icp6_out2",in_place=[icp6_out2])
 # Branch "out3", step 1: stride-1 pooling of icp5_out; declared shape is unchanged (512x14x14).
 # NOTE(review): the pooling window size and pooling type are not visible in this call.
 icp6_pool = NDA("icp6_pool",num_img,512,14,14) # num,chan,y,x
 Pooling(name="icp6_pool",bots=[ icp5_out ],tops=[ icp6_pool ],
 	in_pad="1 1 1 1",stride="1 1")
 # Branch "out3", step 2: 1x1 convolution on the pooled map, 512 -> 64 channels, then in-place ReLU.
 icp6_out3_filts = NDA("icp6_out3_filts",64,512,1,1) # SOURCE out_chan,in_chan,y,x
 icp6_out3_biases = NDA("icp6_out3_biases",64) # SOURCE out_chan
 icp6_out3 = NDA("icp6_out3",num_img,64,14,14) # num,chan,y,x
 Convolution(name="icp6_out3",bots=[ icp6_pool ],tops=[ icp6_out3 ],filts=icp6_out3_filts,biases=icp6_out3_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu_icp6_out3",in_place=[icp6_out3])
 # Branch "out0": direct 1x1 convolution on icp5_out, 512 -> 112 channels, then in-place ReLU.
 icp6_out0_filts = NDA("icp6_out0_filts",112,512,1,1) # SOURCE out_chan,in_chan,y,x
 icp6_out0_biases = NDA("icp6_out0_biases",112) # SOURCE out_chan
 icp6_out0 = NDA("icp6_out0",num_img,112,14,14) # num,chan,y,x
 Convolution(name="icp6_out0",bots=[ icp5_out ],tops=[ icp6_out0 ],filts=icp6_out0_filts,biases=icp6_out0_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu_icp6_out0",in_place=[icp6_out0])
 # Concatenate the four branch outputs: 112 + 288 + 64 + 64 = 528 channels.
 icp6_out = NDA("icp6_out",num_img,528,14,14) # num,chan,y,x
 Concat(name="icp6_out",bots=[ icp6_out0, icp6_out1, icp6_out2, icp6_out3 ],tops=[ icp6_out ],
 	in_pad="0 0 0 0",stride="1 1")
 # cls2 head: a classifier branch fed by icp6_out whose final output
 # (cls2_fc2_) is marked SINK.
 #
 # Stride-3 pooling shrinks the 528x14x14 map to 528x4x4.
 # NOTE(review): the pooling window size and pooling type are not visible in this call.
 cls2_pool = NDA("cls2_pool",num_img,528,4,4) # num,chan,y,x
 Pooling(name="cls2_pool",bots=[ icp6_out ],tops=[ cls2_pool ],
 	in_pad="0 0 0 0",stride="3 3")
 # 1x1 convolution reducing 528 -> 128 channels, then in-place ReLU.
 cls2_reduction_filts = NDA("cls2_reduction_filts",128,528,1,1) # SOURCE out_chan,in_chan,y,x
 cls2_reduction_biases = NDA("cls2_reduction_biases",128) # SOURCE out_chan
 cls2_reduction = NDA("cls2_reduction",num_img,128,4,4) # num,chan,y,x
 Convolution(name="cls2_reduction",bots=[ cls2_pool ],tops=[ cls2_reduction ],filts=cls2_reduction_filts,biases=cls2_reduction_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu_cls2_reduction",in_place=[cls2_reduction])
 # First inner product: the filter tensor spans the whole 128x4x4 input and
 # yields 1024 outputs per image (declared as 1024x1x1).
 cls2_fc1_filts = NDA("cls2_fc1_filts",1024,128,4,4) # SOURCE out_chan,in_chan,y,x
 cls2_fc1_biases = NDA("cls2_fc1_biases",1024) # SOURCE out_chan
 cls2_fc1 = NDA("cls2_fc1",num_img,1024,1,1) # num,chan,y,x
 InnerProduct(name="cls2_fc1",bots=[ cls2_reduction ],tops=[ cls2_fc1 ],filts=cls2_fc1_filts,biases=cls2_fc1_biases,
 	in_pad="0 0 0 0",stride="1 1")
 # ReLU and Dropout are both applied in place on cls2_fc1.
 ReLU(name="relu_cls2_fc1",in_place=[cls2_fc1])
 Dropout(name="cls2_drop",in_place=[cls2_fc1])
 # Second inner product: 1024 -> 33 outputs per image.
 cls2_fc2__filts = NDA("cls2_fc2__filts",33,1024,1,1) # SOURCE out_chan,in_chan,y,x
 cls2_fc2__biases = NDA("cls2_fc2__biases",33) # SOURCE out_chan
 cls2_fc2_ = NDA("cls2_fc2_",num_img,33,1,1) # SINK num,chan,y,x
 InnerProduct(name="cls2_fc2_",bots=[ cls2_fc1 ],tops=[ cls2_fc2_ ],filts=cls2_fc2__filts,biases=cls2_fc2__biases,
 	in_pad="0 0 0 0",stride="1 1")
 # icp7 block: four branches computed from icp6_out (528x14x14), concatenated
 # into icp7_out at the end. Filter shapes are (out_chan, in_chan, y, x).
 #
 # Branch "out1", step 1: 1x1 convolution reducing 528 -> 160 channels, then in-place ReLU.
 icp7_reduction1_filts = NDA("icp7_reduction1_filts",160,528,1,1) # SOURCE out_chan,in_chan,y,x
 icp7_reduction1_biases = NDA("icp7_reduction1_biases",160) # SOURCE out_chan
 icp7_reduction1 = NDA("icp7_reduction1",num_img,160,14,14) # num,chan,y,x
 Convolution(name="icp7_reduction1",bots=[ icp6_out ],tops=[ icp7_reduction1 ],filts=icp7_reduction1_filts,biases=icp7_reduction1_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu_icp7_reduction1",in_place=[icp7_reduction1])
 # Branch "out1", step 2: 3x3 convolution 160 -> 320 channels (in_pad 1, stride 1).
 icp7_out1_filts = NDA("icp7_out1_filts",320,160,3,3) # SOURCE out_chan,in_chan,y,x
 icp7_out1_biases = NDA("icp7_out1_biases",320) # SOURCE out_chan
 icp7_out1 = NDA("icp7_out1",num_img,320,14,14) # num,chan,y,x
 Convolution(name="icp7_out1",bots=[ icp7_reduction1 ],tops=[ icp7_out1 ],filts=icp7_out1_filts,biases=icp7_out1_biases,
 	in_pad="1 1 1 1",stride="1 1")
 ReLU(name="relu_icp7_out1",in_place=[icp7_out1])
 # Branch "out2", step 1: 1x1 convolution reducing 528 -> 32 channels, then in-place ReLU.
 icp7_reduction2_filts = NDA("icp7_reduction2_filts",32,528,1,1) # SOURCE out_chan,in_chan,y,x
 icp7_reduction2_biases = NDA("icp7_reduction2_biases",32) # SOURCE out_chan
 icp7_reduction2 = NDA("icp7_reduction2",num_img,32,14,14) # num,chan,y,x
 Convolution(name="icp7_reduction2",bots=[ icp6_out ],tops=[ icp7_reduction2 ],filts=icp7_reduction2_filts,biases=icp7_reduction2_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu_icp7_reduction2",in_place=[icp7_reduction2])
 # Branch "out2", step 2: 5x5 convolution 32 -> 128 channels (in_pad 2, stride 1).
 icp7_out2_filts = NDA("icp7_out2_filts",128,32,5,5) # SOURCE out_chan,in_chan,y,x
 icp7_out2_biases = NDA("icp7_out2_biases",128) # SOURCE out_chan
 icp7_out2 = NDA("icp7_out2",num_img,128,14,14) # num,chan,y,x
 Convolution(name="icp7_out2",bots=[ icp7_reduction2 ],tops=[ icp7_out2 ],filts=icp7_out2_filts,biases=icp7_out2_biases,
 	in_pad="2 2 2 2",stride="1 1")
 ReLU(name="relu_icp7_out2",in_place=[icp7_out2])
 # Branch "out3", step 1: stride-1 pooling of icp6_out; declared shape is unchanged (528x14x14).
 # NOTE(review): the pooling window size and pooling type are not visible in this call.
 icp7_pool = NDA("icp7_pool",num_img,528,14,14) # num,chan,y,x
 Pooling(name="icp7_pool",bots=[ icp6_out ],tops=[ icp7_pool ],
 	in_pad="1 1 1 1",stride="1 1")
 # Branch "out3", step 2: 1x1 convolution on the pooled map, 528 -> 128 channels, then in-place ReLU.
 icp7_out3_filts = NDA("icp7_out3_filts",128,528,1,1) # SOURCE out_chan,in_chan,y,x
 icp7_out3_biases = NDA("icp7_out3_biases",128) # SOURCE out_chan
 icp7_out3 = NDA("icp7_out3",num_img,128,14,14) # num,chan,y,x
 Convolution(name="icp7_out3",bots=[ icp7_pool ],tops=[ icp7_out3 ],filts=icp7_out3_filts,biases=icp7_out3_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu_icp7_out3",in_place=[icp7_out3])
 # Branch "out0": direct 1x1 convolution on icp6_out, 528 -> 256 channels, then in-place ReLU.
 icp7_out0_filts = NDA("icp7_out0_filts",256,528,1,1) # SOURCE out_chan,in_chan,y,x
 icp7_out0_biases = NDA("icp7_out0_biases",256) # SOURCE out_chan
 icp7_out0 = NDA("icp7_out0",num_img,256,14,14) # num,chan,y,x
 Convolution(name="icp7_out0",bots=[ icp6_out ],tops=[ icp7_out0 ],filts=icp7_out0_filts,biases=icp7_out0_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu_icp7_out0",in_place=[icp7_out0])
 # Concatenate the four branch outputs: 256 + 320 + 128 + 128 = 832 channels.
 icp7_out = NDA("icp7_out",num_img,832,14,14) # num,chan,y,x
 Concat(name="icp7_out",bots=[ icp7_out0, icp7_out1, icp7_out2, icp7_out3 ],tops=[ icp7_out ],
 	in_pad="0 0 0 0",stride="1 1")
 # icp8 block: icp7_out is first downsampled by a stride-2 pooling
 # (832x14x14 -> 832x7x7, stored in icp8_in); four branches computed from
 # icp8_in are then concatenated into icp8_out.
 # NOTE(review): pooling window sizes and pooling types are not visible in the Pooling calls.
 icp8_in = NDA("icp8_in",num_img,832,7,7) # num,chan,y,x
 Pooling(name="icp8_in",bots=[ icp7_out ],tops=[ icp8_in ],
 	in_pad="0 0 0 0",stride="2 2")
 # Branch "out1", step 1: 1x1 convolution reducing 832 -> 160 channels, then in-place ReLU.
 icp8_reduction1_filts = NDA("icp8_reduction1_filts",160,832,1,1) # SOURCE out_chan,in_chan,y,x
 icp8_reduction1_biases = NDA("icp8_reduction1_biases",160) # SOURCE out_chan
 icp8_reduction1 = NDA("icp8_reduction1",num_img,160,7,7) # num,chan,y,x
 Convolution(name="icp8_reduction1",bots=[ icp8_in ],tops=[ icp8_reduction1 ],filts=icp8_reduction1_filts,biases=icp8_reduction1_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu_icp8_reduction1",in_place=[icp8_reduction1])
 # Branch "out1", step 2: 3x3 convolution 160 -> 320 channels (in_pad 1, stride 1).
 icp8_out1_filts = NDA("icp8_out1_filts",320,160,3,3) # SOURCE out_chan,in_chan,y,x
 icp8_out1_biases = NDA("icp8_out1_biases",320) # SOURCE out_chan
 icp8_out1 = NDA("icp8_out1",num_img,320,7,7) # num,chan,y,x
 Convolution(name="icp8_out1",bots=[ icp8_reduction1 ],tops=[ icp8_out1 ],filts=icp8_out1_filts,biases=icp8_out1_biases,
 	in_pad="1 1 1 1",stride="1 1")
 ReLU(name="relu_icp8_out1",in_place=[icp8_out1])
 # Branch "out2", step 1: 1x1 convolution reducing 832 -> 32 channels, then in-place ReLU.
 icp8_reduction2_filts = NDA("icp8_reduction2_filts",32,832,1,1) # SOURCE out_chan,in_chan,y,x
 icp8_reduction2_biases = NDA("icp8_reduction2_biases",32) # SOURCE out_chan
 icp8_reduction2 = NDA("icp8_reduction2",num_img,32,7,7) # num,chan,y,x
 Convolution(name="icp8_reduction2",bots=[ icp8_in ],tops=[ icp8_reduction2 ],filts=icp8_reduction2_filts,biases=icp8_reduction2_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu_icp8_reduction2",in_place=[icp8_reduction2])
 # Branch "out2", step 2: 5x5 convolution 32 -> 128 channels (in_pad 2, stride 1).
 icp8_out2_filts = NDA("icp8_out2_filts",128,32,5,5) # SOURCE out_chan,in_chan,y,x
 icp8_out2_biases = NDA("icp8_out2_biases",128) # SOURCE out_chan
 icp8_out2 = NDA("icp8_out2",num_img,128,7,7) # num,chan,y,x
 Convolution(name="icp8_out2",bots=[ icp8_reduction2 ],tops=[ icp8_out2 ],filts=icp8_out2_filts,biases=icp8_out2_biases,
 	in_pad="2 2 2 2",stride="1 1")
 ReLU(name="relu_icp8_out2",in_place=[icp8_out2])
 # Branch "out3", step 1: stride-1 pooling of icp8_in; declared shape is unchanged (832x7x7).
 icp8_pool = NDA("icp8_pool",num_img,832,7,7) # num,chan,y,x
 Pooling(name="icp8_pool",bots=[ icp8_in ],tops=[ icp8_pool ],
 	in_pad="1 1 1 1",stride="1 1")
 # Branch "out3", step 2: 1x1 convolution on the pooled map, 832 -> 128 channels, then in-place ReLU.
 icp8_out3_filts = NDA("icp8_out3_filts",128,832,1,1) # SOURCE out_chan,in_chan,y,x
 icp8_out3_biases = NDA("icp8_out3_biases",128) # SOURCE out_chan
 icp8_out3 = NDA("icp8_out3",num_img,128,7,7) # num,chan,y,x
 Convolution(name="icp8_out3",bots=[ icp8_pool ],tops=[ icp8_out3 ],filts=icp8_out3_filts,biases=icp8_out3_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu_icp8_out3",in_place=[icp8_out3])
 # Branch "out0": direct 1x1 convolution on icp8_in, 832 -> 256 channels, then in-place ReLU.
 icp8_out0_filts = NDA("icp8_out0_filts",256,832,1,1) # SOURCE out_chan,in_chan,y,x
 icp8_out0_biases = NDA("icp8_out0_biases",256) # SOURCE out_chan
 icp8_out0 = NDA("icp8_out0",num_img,256,7,7) # num,chan,y,x
 Convolution(name="icp8_out0",bots=[ icp8_in ],tops=[ icp8_out0 ],filts=icp8_out0_filts,biases=icp8_out0_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu_icp8_out0",in_place=[icp8_out0])
 # Concatenate the four branch outputs: 256 + 320 + 128 + 128 = 832 channels.
 icp8_out = NDA("icp8_out",num_img,832,7,7) # num,chan,y,x
 Concat(name="icp8_out",bots=[ icp8_out0, icp8_out1, icp8_out2, icp8_out3 ],tops=[ icp8_out ],
 	in_pad="0 0 0 0",stride="1 1")
 # icp9 block: four branches computed from icp8_out (832x7x7), concatenated
 # into icp9_out at the end. Filter shapes are (out_chan, in_chan, y, x).
 #
 # Branch "out1", step 1: 1x1 convolution reducing 832 -> 192 channels, then in-place ReLU.
 icp9_reduction1_filts = NDA("icp9_reduction1_filts",192,832,1,1) # SOURCE out_chan,in_chan,y,x
 icp9_reduction1_biases = NDA("icp9_reduction1_biases",192) # SOURCE out_chan
 icp9_reduction1 = NDA("icp9_reduction1",num_img,192,7,7) # num,chan,y,x
 Convolution(name="icp9_reduction1",bots=[ icp8_out ],tops=[ icp9_reduction1 ],filts=icp9_reduction1_filts,biases=icp9_reduction1_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu_icp9_reduction1",in_place=[icp9_reduction1])
 # Branch "out1", step 2: 3x3 convolution 192 -> 384 channels (in_pad 1, stride 1).
 icp9_out1_filts = NDA("icp9_out1_filts",384,192,3,3) # SOURCE out_chan,in_chan,y,x
 icp9_out1_biases = NDA("icp9_out1_biases",384) # SOURCE out_chan
 icp9_out1 = NDA("icp9_out1",num_img,384,7,7) # num,chan,y,x
 Convolution(name="icp9_out1",bots=[ icp9_reduction1 ],tops=[ icp9_out1 ],filts=icp9_out1_filts,biases=icp9_out1_biases,
 	in_pad="1 1 1 1",stride="1 1")
 ReLU(name="relu_icp9_out1",in_place=[icp9_out1])
 # Branch "out2", step 1: 1x1 convolution reducing 832 -> 48 channels, then in-place ReLU.
 icp9_reduction2_filts = NDA("icp9_reduction2_filts",48,832,1,1) # SOURCE out_chan,in_chan,y,x
 icp9_reduction2_biases = NDA("icp9_reduction2_biases",48) # SOURCE out_chan
 icp9_reduction2 = NDA("icp9_reduction2",num_img,48,7,7) # num,chan,y,x
 Convolution(name="icp9_reduction2",bots=[ icp8_out ],tops=[ icp9_reduction2 ],filts=icp9_reduction2_filts,biases=icp9_reduction2_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu_icp9_reduction2",in_place=[icp9_reduction2])
 # Branch "out2", step 2: 5x5 convolution 48 -> 128 channels (in_pad 2, stride 1).
 icp9_out2_filts = NDA("icp9_out2_filts",128,48,5,5) # SOURCE out_chan,in_chan,y,x
 icp9_out2_biases = NDA("icp9_out2_biases",128) # SOURCE out_chan
 icp9_out2 = NDA("icp9_out2",num_img,128,7,7) # num,chan,y,x
 Convolution(name="icp9_out2",bots=[ icp9_reduction2 ],tops=[ icp9_out2 ],filts=icp9_out2_filts,biases=icp9_out2_biases,
 	in_pad="2 2 2 2",stride="1 1")
 ReLU(name="relu_icp9_out2",in_place=[icp9_out2])
 # Branch "out3", step 1: stride-1 pooling of icp8_out; declared shape is unchanged (832x7x7).
 # NOTE(review): the pooling window size and pooling type are not visible in this call.
 icp9_pool = NDA("icp9_pool",num_img,832,7,7) # num,chan,y,x
 Pooling(name="icp9_pool",bots=[ icp8_out ],tops=[ icp9_pool ],
 	in_pad="1 1 1 1",stride="1 1")
 # Branch "out3", step 2: 1x1 convolution on the pooled map, 832 -> 128 channels, then in-place ReLU.
 icp9_out3_filts = NDA("icp9_out3_filts",128,832,1,1) # SOURCE out_chan,in_chan,y,x
 icp9_out3_biases = NDA("icp9_out3_biases",128) # SOURCE out_chan
 icp9_out3 = NDA("icp9_out3",num_img,128,7,7) # num,chan,y,x
 Convolution(name="icp9_out3",bots=[ icp9_pool ],tops=[ icp9_out3 ],filts=icp9_out3_filts,biases=icp9_out3_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu_icp9_out3",in_place=[icp9_out3])
 # Branch "out0": direct 1x1 convolution on icp8_out, 832 -> 384 channels, then in-place ReLU.
 icp9_out0_filts = NDA("icp9_out0_filts",384,832,1,1) # SOURCE out_chan,in_chan,y,x
 icp9_out0_biases = NDA("icp9_out0_biases",384) # SOURCE out_chan
 icp9_out0 = NDA("icp9_out0",num_img,384,7,7) # num,chan,y,x
 Convolution(name="icp9_out0",bots=[ icp8_out ],tops=[ icp9_out0 ],filts=icp9_out0_filts,biases=icp9_out0_biases,
 	in_pad="0 0 0 0",stride="1 1")
 ReLU(name="relu_icp9_out0",in_place=[icp9_out0])
 # Concatenate the four branch outputs: 384 + 384 + 128 + 128 = 1024 channels.
 icp9_out = NDA("icp9_out",num_img,1024,7,7) # num,chan,y,x
 Concat(name="icp9_out",bots=[ icp9_out0, icp9_out1, icp9_out2, icp9_out3 ],tops=[ icp9_out ],
 	in_pad="0 0 0 0",stride="1 1")
 # cls3 head: classifier fed by icp9_out; its output (cls3_fc_) is marked SINK.
 #
 # Pooling collapses the 1024x7x7 map to 1024x1x1.
 # NOTE(review): with stride 1 and no padding this presumably uses a window
 # covering the whole 7x7 map — the window size is not visible here; confirm.
 cls3_pool = NDA("cls3_pool",num_img,1024,1,1) # num,chan,y,x
 Pooling(name="cls3_pool",bots=[ icp9_out ],tops=[ cls3_pool ],
 	in_pad="0 0 0 0",stride="1 1")
 # Dropout is applied in place on the pooled features.
 Dropout(name="cls3_drop",in_place=[cls3_pool])
 # Inner product: 1024 -> 33 outputs per image.
 cls3_fc__filts = NDA("cls3_fc__filts",33,1024,1,1) # SOURCE out_chan,in_chan,y,x
 cls3_fc__biases = NDA("cls3_fc__biases",33) # SOURCE out_chan
 cls3_fc_ = NDA("cls3_fc_",num_img,33,1,1) # SINK num,chan,y,x
 InnerProduct(name="cls3_fc_",bots=[ cls3_pool ],tops=[ cls3_fc_ ],filts=cls3_fc__filts,biases=cls3_fc__biases,
 	in_pad="0 0 0 0",stride="1 1")
diff --git a/googlenet_ft_fl.sgemm.py b/googlenet_ft_fl.sgemm.py
 # Network stem in "sgemm" form: each convolution is written as a per-image
 # patches_to_rows() expansion of the input into a 2-D buffer, followed by a
 # matrix product with the filter tensor reshaped to (out_chan, in_chan*ky*kx)
 # and transposed. num_img is not defined in this excerpt.
 # In the trailing "MxNxK" comments the three numbers are: buffer rows,
 # buffer columns (the dimension shared by both operands), output channels.
 # NOTE(review): the *_biases arrays are declared but not referenced by any
 # statement in this block.
 # NOTE(review): inside each loop the product is assigned to the whole output
 # name (e.g. conv1), not to an image-i slice — presumably shorthand; confirm.
 data = NDA("data",num_img,3,224,224) # SOURCE num,chan,y,x
 # conv1: 7x7 filters, 3 -> 64 channels, stride 2, in_pad 3 (224x224 -> 112x112).
 conv1_filts = NDA("conv1_filts",64,3,7,7) # SOURCE out_chan,in_chan,y,x
 conv1_biases = NDA("conv1_biases",64) # SOURCE out_chan
 conv1 = NDA("conv1",num_img,64,112,112) # num,chan,y,x
 # Patch buffer: 12544 = 112*112 rows (one per output position), 147 = 3*7*7 columns.
 data_one_row_per_patch_buf = NDA("data_one_row_per_patch_buf",12544,147)
 for i in range(0,num_img):
  patches_to_rows( src=data[i,:,:,:], dest=data_one_row_per_patch_buf, in_pad="3 3 3 3",stride="2 2" ) # one copy per output elem
  conv1 = data_one_row_per_patch_buf * transpose(reshape(conv1_filts,64,147)) # sgemm: MxNxK == 12544x147x64
 ReLU(name="relu1",in_place=[conv1])
 # Stride-2 pooling (112x112 -> 56x56) followed by LRN at the same shape.
 # NOTE(review): the pooling window size and pooling type are not visible in this call.
 pool1 = NDA("pool1",num_img,64,56,56) # num,chan,y,x
 Pooling(name="pool1",bots=[ conv1 ],tops=[ pool1 ],
 	in_pad="0 0 0 0",stride="2 2")
 norm1 = NDA("norm1",num_img,64,56,56) # num,chan,y,x
 LRN(name="norm1",bots=[ pool1 ],tops=[ norm1 ],
 	in_pad="0 0 0 0",stride="1 1")
 # reduction2: 1x1 filters, 64 -> 64 channels.
 reduction2_filts = NDA("reduction2_filts",64,64,1,1) # SOURCE out_chan,in_chan,y,x
 reduction2_biases = NDA("reduction2_biases",64) # SOURCE out_chan
 reduction2 = NDA("reduction2",num_img,64,56,56) # num,chan,y,x
 # Patch buffer: 3136 = 56*56 rows, 64 = 64*1*1 columns.
 norm1_one_row_per_patch_buf = NDA("norm1_one_row_per_patch_buf",3136,64)
 for i in range(0,num_img):
  patches_to_rows( src=norm1[i,:,:,:], dest=norm1_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  reduction2 = norm1_one_row_per_patch_buf * transpose(reshape(reduction2_filts,64,64)) # sgemm: MxNxK == 3136x64x64
 ReLU(name="relu_reduction2",in_place=[reduction2])
 # conv2: 3x3 filters, 64 -> 192 channels, in_pad 1, stride 1 (stays 56x56).
 conv2_filts = NDA("conv2_filts",192,64,3,3) # SOURCE out_chan,in_chan,y,x
 conv2_biases = NDA("conv2_biases",192) # SOURCE out_chan
 conv2 = NDA("conv2",num_img,192,56,56) # num,chan,y,x
 # Patch buffer: 3136 = 56*56 rows, 576 = 64*3*3 columns.
 reduction2_one_row_per_patch_buf = NDA("reduction2_one_row_per_patch_buf",3136,576)
 for i in range(0,num_img):
  patches_to_rows( src=reduction2[i,:,:,:], dest=reduction2_one_row_per_patch_buf, in_pad="1 1 1 1",stride="1 1" ) # one copy per output elem
  conv2 = reduction2_one_row_per_patch_buf * transpose(reshape(conv2_filts,192,576)) # sgemm: MxNxK == 3136x576x192
 ReLU(name="relu2",in_place=[conv2])
 # LRN at the same shape, then stride-2 pooling (56x56 -> 28x28).
 norm2 = NDA("norm2",num_img,192,56,56) # num,chan,y,x
 LRN(name="norm2",bots=[ conv2 ],tops=[ norm2 ],
 	in_pad="0 0 0 0",stride="1 1")
 pool2 = NDA("pool2",num_img,192,28,28) # num,chan,y,x
 Pooling(name="pool2",bots=[ norm2 ],tops=[ pool2 ],
 	in_pad="0 0 0 0",stride="2 2")
 # icp1 block (sgemm form): four branches computed from pool2 (192x28x28),
 # concatenated into icp2_in at the end. Each convolution is a per-image
 # patches_to_rows() expansion followed by a matrix product with the filter
 # tensor reshaped to (out_chan, in_chan*ky*kx) and transposed.
 # Patch buffers have 784 = 28*28 rows and in_chan*ky*kx columns.
 # NOTE(review): the *_biases arrays are declared but not referenced by any
 # statement in this block.
 # NOTE(review): pool2_one_row_per_patch_buf is declared and filled three times
 # in this block with identical arguments (for reduction1, reduction2 and out0).
 #
 # Branch "out1", step 1: 1x1 filters, 192 -> 96 channels, then in-place ReLU.
 icp1_reduction1_filts = NDA("icp1_reduction1_filts",96,192,1,1) # SOURCE out_chan,in_chan,y,x
 icp1_reduction1_biases = NDA("icp1_reduction1_biases",96) # SOURCE out_chan
 icp1_reduction1 = NDA("icp1_reduction1",num_img,96,28,28) # num,chan,y,x
 pool2_one_row_per_patch_buf = NDA("pool2_one_row_per_patch_buf",784,192)
 for i in range(0,num_img):
  patches_to_rows( src=pool2[i,:,:,:], dest=pool2_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  icp1_reduction1 = pool2_one_row_per_patch_buf * transpose(reshape(icp1_reduction1_filts,96,192)) # sgemm: MxNxK == 784x192x96
 ReLU(name="relu_icp1_reduction1",in_place=[icp1_reduction1])
 # Branch "out1", step 2: 3x3 filters, 96 -> 128 channels, in_pad 1 (864 = 96*3*3 columns).
 icp1_out1_filts = NDA("icp1_out1_filts",128,96,3,3) # SOURCE out_chan,in_chan,y,x
 icp1_out1_biases = NDA("icp1_out1_biases",128) # SOURCE out_chan
 icp1_out1 = NDA("icp1_out1",num_img,128,28,28) # num,chan,y,x
 icp1_reduction1_one_row_per_patch_buf = NDA("icp1_reduction1_one_row_per_patch_buf",784,864)
 for i in range(0,num_img):
  patches_to_rows( src=icp1_reduction1[i,:,:,:], dest=icp1_reduction1_one_row_per_patch_buf, in_pad="1 1 1 1",stride="1 1" ) # one copy per output elem
  icp1_out1 = icp1_reduction1_one_row_per_patch_buf * transpose(reshape(icp1_out1_filts,128,864)) # sgemm: MxNxK == 784x864x128
 ReLU(name="relu_icp1_out1",in_place=[icp1_out1])
 # Branch "out2", step 1: 1x1 filters, 192 -> 16 channels, then in-place ReLU.
 icp1_reduction2_filts = NDA("icp1_reduction2_filts",16,192,1,1) # SOURCE out_chan,in_chan,y,x
 icp1_reduction2_biases = NDA("icp1_reduction2_biases",16) # SOURCE out_chan
 icp1_reduction2 = NDA("icp1_reduction2",num_img,16,28,28) # num,chan,y,x
 pool2_one_row_per_patch_buf = NDA("pool2_one_row_per_patch_buf",784,192)
 for i in range(0,num_img):
  patches_to_rows( src=pool2[i,:,:,:], dest=pool2_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  icp1_reduction2 = pool2_one_row_per_patch_buf * transpose(reshape(icp1_reduction2_filts,16,192)) # sgemm: MxNxK == 784x192x16
 ReLU(name="relu_icp1_reduction2",in_place=[icp1_reduction2])
 # Branch "out2", step 2: 5x5 filters, 16 -> 32 channels, in_pad 2 (400 = 16*5*5 columns).
 icp1_out2_filts = NDA("icp1_out2_filts",32,16,5,5) # SOURCE out_chan,in_chan,y,x
 icp1_out2_biases = NDA("icp1_out2_biases",32) # SOURCE out_chan
 icp1_out2 = NDA("icp1_out2",num_img,32,28,28) # num,chan,y,x
 icp1_reduction2_one_row_per_patch_buf = NDA("icp1_reduction2_one_row_per_patch_buf",784,400)
 for i in range(0,num_img):
  patches_to_rows( src=icp1_reduction2[i,:,:,:], dest=icp1_reduction2_one_row_per_patch_buf, in_pad="2 2 2 2",stride="1 1" ) # one copy per output elem
  icp1_out2 = icp1_reduction2_one_row_per_patch_buf * transpose(reshape(icp1_out2_filts,32,400)) # sgemm: MxNxK == 784x400x32
 ReLU(name="relu_icp1_out2",in_place=[icp1_out2])
 # Branch "out3", step 1: stride-1 pooling of pool2; declared shape is unchanged (192x28x28).
 # NOTE(review): the pooling window size and pooling type are not visible in this call.
 icp1_pool = NDA("icp1_pool",num_img,192,28,28) # num,chan,y,x
 Pooling(name="icp1_pool",bots=[ pool2 ],tops=[ icp1_pool ],
 	in_pad="1 1 1 1",stride="1 1")
 # Branch "out3", step 2: 1x1 filters on the pooled map, 192 -> 32 channels, then in-place ReLU.
 icp1_out3_filts = NDA("icp1_out3_filts",32,192,1,1) # SOURCE out_chan,in_chan,y,x
 icp1_out3_biases = NDA("icp1_out3_biases",32) # SOURCE out_chan
 icp1_out3 = NDA("icp1_out3",num_img,32,28,28) # num,chan,y,x
 icp1_pool_one_row_per_patch_buf = NDA("icp1_pool_one_row_per_patch_buf",784,192)
 for i in range(0,num_img):
  patches_to_rows( src=icp1_pool[i,:,:,:], dest=icp1_pool_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  icp1_out3 = icp1_pool_one_row_per_patch_buf * transpose(reshape(icp1_out3_filts,32,192)) # sgemm: MxNxK == 784x192x32
 ReLU(name="relu_icp1_out3",in_place=[icp1_out3])
 # Branch "out0": 1x1 filters directly on pool2, 192 -> 64 channels, then in-place ReLU.
 icp1_out0_filts = NDA("icp1_out0_filts",64,192,1,1) # SOURCE out_chan,in_chan,y,x
 icp1_out0_biases = NDA("icp1_out0_biases",64) # SOURCE out_chan
 icp1_out0 = NDA("icp1_out0",num_img,64,28,28) # num,chan,y,x
 pool2_one_row_per_patch_buf = NDA("pool2_one_row_per_patch_buf",784,192)
 for i in range(0,num_img):
  patches_to_rows( src=pool2[i,:,:,:], dest=pool2_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  icp1_out0 = pool2_one_row_per_patch_buf * transpose(reshape(icp1_out0_filts,64,192)) # sgemm: MxNxK == 784x192x64
 ReLU(name="relu_icp1_out0",in_place=[icp1_out0])
 # Concatenate the four branch outputs: 64 + 128 + 32 + 32 = 256 channels.
 icp2_in = NDA("icp2_in",num_img,256,28,28) # num,chan,y,x
 Concat(name="icp2_in",bots=[ icp1_out0, icp1_out1, icp1_out2, icp1_out3 ],tops=[ icp2_in ],
 	in_pad="0 0 0 0",stride="1 1")
 # icp2 block (sgemm form): four branches computed from icp2_in (256x28x28),
 # concatenated into icp2_out at the end. Each convolution is a per-image
 # patches_to_rows() expansion followed by a matrix product with the filter
 # tensor reshaped to (out_chan, in_chan*ky*kx) and transposed.
 # Patch buffers have 784 = 28*28 rows and in_chan*ky*kx columns.
 # NOTE(review): the *_biases arrays are declared but not referenced by any
 # statement in this block.
 # NOTE(review): icp2_in_one_row_per_patch_buf is declared and filled three
 # times in this block with identical arguments (for reduction1, reduction2 and out0).
 #
 # Branch "out1", step 1: 1x1 filters, 256 -> 128 channels, then in-place ReLU.
 icp2_reduction1_filts = NDA("icp2_reduction1_filts",128,256,1,1) # SOURCE out_chan,in_chan,y,x
 icp2_reduction1_biases = NDA("icp2_reduction1_biases",128) # SOURCE out_chan
 icp2_reduction1 = NDA("icp2_reduction1",num_img,128,28,28) # num,chan,y,x
 icp2_in_one_row_per_patch_buf = NDA("icp2_in_one_row_per_patch_buf",784,256)
 for i in range(0,num_img):
  patches_to_rows( src=icp2_in[i,:,:,:], dest=icp2_in_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  icp2_reduction1 = icp2_in_one_row_per_patch_buf * transpose(reshape(icp2_reduction1_filts,128,256)) # sgemm: MxNxK == 784x256x128
 ReLU(name="relu_icp2_reduction1",in_place=[icp2_reduction1])
 # Branch "out1", step 2: 3x3 filters, 128 -> 192 channels, in_pad 1 (1152 = 128*3*3 columns).
 icp2_out1_filts = NDA("icp2_out1_filts",192,128,3,3) # SOURCE out_chan,in_chan,y,x
 icp2_out1_biases = NDA("icp2_out1_biases",192) # SOURCE out_chan
 icp2_out1 = NDA("icp2_out1",num_img,192,28,28) # num,chan,y,x
 icp2_reduction1_one_row_per_patch_buf = NDA("icp2_reduction1_one_row_per_patch_buf",784,1152)
 for i in range(0,num_img):
  patches_to_rows( src=icp2_reduction1[i,:,:,:], dest=icp2_reduction1_one_row_per_patch_buf, in_pad="1 1 1 1",stride="1 1" ) # one copy per output elem
  icp2_out1 = icp2_reduction1_one_row_per_patch_buf * transpose(reshape(icp2_out1_filts,192,1152)) # sgemm: MxNxK == 784x1152x192
 ReLU(name="relu_icp2_out1",in_place=[icp2_out1])
 # Branch "out2", step 1: 1x1 filters, 256 -> 32 channels, then in-place ReLU.
 icp2_reduction2_filts = NDA("icp2_reduction2_filts",32,256,1,1) # SOURCE out_chan,in_chan,y,x
 icp2_reduction2_biases = NDA("icp2_reduction2_biases",32) # SOURCE out_chan
 icp2_reduction2 = NDA("icp2_reduction2",num_img,32,28,28) # num,chan,y,x
 icp2_in_one_row_per_patch_buf = NDA("icp2_in_one_row_per_patch_buf",784,256)
 for i in range(0,num_img):
  patches_to_rows( src=icp2_in[i,:,:,:], dest=icp2_in_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  icp2_reduction2 = icp2_in_one_row_per_patch_buf * transpose(reshape(icp2_reduction2_filts,32,256)) # sgemm: MxNxK == 784x256x32
 ReLU(name="relu_icp2_reduction2",in_place=[icp2_reduction2])
 # Branch "out2", step 2: 5x5 filters, 32 -> 96 channels, in_pad 2 (800 = 32*5*5 columns).
 icp2_out2_filts = NDA("icp2_out2_filts",96,32,5,5) # SOURCE out_chan,in_chan,y,x
 icp2_out2_biases = NDA("icp2_out2_biases",96) # SOURCE out_chan
 icp2_out2 = NDA("icp2_out2",num_img,96,28,28) # num,chan,y,x
 icp2_reduction2_one_row_per_patch_buf = NDA("icp2_reduction2_one_row_per_patch_buf",784,800)
 for i in range(0,num_img):
  patches_to_rows( src=icp2_reduction2[i,:,:,:], dest=icp2_reduction2_one_row_per_patch_buf, in_pad="2 2 2 2",stride="1 1" ) # one copy per output elem
  icp2_out2 = icp2_reduction2_one_row_per_patch_buf * transpose(reshape(icp2_out2_filts,96,800)) # sgemm: MxNxK == 784x800x96
 ReLU(name="relu_icp2_out2",in_place=[icp2_out2])
 # Branch "out3", step 1: stride-1 pooling of icp2_in; declared shape is unchanged (256x28x28).
 # NOTE(review): the pooling window size and pooling type are not visible in this call.
 icp2_pool = NDA("icp2_pool",num_img,256,28,28) # num,chan,y,x
 Pooling(name="icp2_pool",bots=[ icp2_in ],tops=[ icp2_pool ],
 	in_pad="1 1 1 1",stride="1 1")
 # Branch "out3", step 2: 1x1 filters on the pooled map, 256 -> 64 channels, then in-place ReLU.
 icp2_out3_filts = NDA("icp2_out3_filts",64,256,1,1) # SOURCE out_chan,in_chan,y,x
 icp2_out3_biases = NDA("icp2_out3_biases",64) # SOURCE out_chan
 icp2_out3 = NDA("icp2_out3",num_img,64,28,28) # num,chan,y,x
 icp2_pool_one_row_per_patch_buf = NDA("icp2_pool_one_row_per_patch_buf",784,256)
 for i in range(0,num_img):
  patches_to_rows( src=icp2_pool[i,:,:,:], dest=icp2_pool_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  icp2_out3 = icp2_pool_one_row_per_patch_buf * transpose(reshape(icp2_out3_filts,64,256)) # sgemm: MxNxK == 784x256x64
 ReLU(name="relu_icp2_out3",in_place=[icp2_out3])
 # Branch "out0": 1x1 filters directly on icp2_in, 256 -> 128 channels, then in-place ReLU.
 icp2_out0_filts = NDA("icp2_out0_filts",128,256,1,1) # SOURCE out_chan,in_chan,y,x
 icp2_out0_biases = NDA("icp2_out0_biases",128) # SOURCE out_chan
 icp2_out0 = NDA("icp2_out0",num_img,128,28,28) # num,chan,y,x
 icp2_in_one_row_per_patch_buf = NDA("icp2_in_one_row_per_patch_buf",784,256)
 for i in range(0,num_img):
  patches_to_rows( src=icp2_in[i,:,:,:], dest=icp2_in_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  icp2_out0 = icp2_in_one_row_per_patch_buf * transpose(reshape(icp2_out0_filts,128,256)) # sgemm: MxNxK == 784x256x128
 ReLU(name="relu_icp2_out0",in_place=[icp2_out0])
 # Concatenate the four branch outputs: 128 + 192 + 96 + 64 = 480 channels.
 icp2_out = NDA("icp2_out",num_img,480,28,28) # num,chan,y,x
 Concat(name="icp2_out",bots=[ icp2_out0, icp2_out1, icp2_out2, icp2_out3 ],tops=[ icp2_out ],
 	in_pad="0 0 0 0",stride="1 1")
 # icp3 block (sgemm form): icp2_out is first downsampled by a stride-2 pooling
 # (480x28x28 -> 480x14x14, stored in icp3_in); four branches computed from
 # icp3_in are then concatenated into icp3_out. Each convolution is a per-image
 # patches_to_rows() expansion followed by a matrix product with the filter
 # tensor reshaped to (out_chan, in_chan*ky*kx) and transposed.
 # Patch buffers have 196 = 14*14 rows and in_chan*ky*kx columns.
 # NOTE(review): pooling window sizes and pooling types are not visible in the Pooling calls.
 # NOTE(review): the *_biases arrays are declared but not referenced by any
 # statement in this block.
 # NOTE(review): icp3_in_one_row_per_patch_buf is declared and filled three
 # times in this block with identical arguments (for reduction1, reduction2 and out0).
 icp3_in = NDA("icp3_in",num_img,480,14,14) # num,chan,y,x
 Pooling(name="icp3_in",bots=[ icp2_out ],tops=[ icp3_in ],
 	in_pad="0 0 0 0",stride="2 2")
 # Branch "out1", step 1: 1x1 filters, 480 -> 96 channels, then in-place ReLU.
 icp3_reduction1_filts = NDA("icp3_reduction1_filts",96,480,1,1) # SOURCE out_chan,in_chan,y,x
 icp3_reduction1_biases = NDA("icp3_reduction1_biases",96) # SOURCE out_chan
 icp3_reduction1 = NDA("icp3_reduction1",num_img,96,14,14) # num,chan,y,x
 icp3_in_one_row_per_patch_buf = NDA("icp3_in_one_row_per_patch_buf",196,480)
 for i in range(0,num_img):
  patches_to_rows( src=icp3_in[i,:,:,:], dest=icp3_in_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  icp3_reduction1 = icp3_in_one_row_per_patch_buf * transpose(reshape(icp3_reduction1_filts,96,480)) # sgemm: MxNxK == 196x480x96
 ReLU(name="relu_icp3_reduction1",in_place=[icp3_reduction1])
 # Branch "out1", step 2: 3x3 filters, 96 -> 208 channels, in_pad 1 (864 = 96*3*3 columns).
 icp3_out1_filts = NDA("icp3_out1_filts",208,96,3,3) # SOURCE out_chan,in_chan,y,x
 icp3_out1_biases = NDA("icp3_out1_biases",208) # SOURCE out_chan
 icp3_out1 = NDA("icp3_out1",num_img,208,14,14) # num,chan,y,x
 icp3_reduction1_one_row_per_patch_buf = NDA("icp3_reduction1_one_row_per_patch_buf",196,864)
 for i in range(0,num_img):
  patches_to_rows( src=icp3_reduction1[i,:,:,:], dest=icp3_reduction1_one_row_per_patch_buf, in_pad="1 1 1 1",stride="1 1" ) # one copy per output elem
  icp3_out1 = icp3_reduction1_one_row_per_patch_buf * transpose(reshape(icp3_out1_filts,208,864)) # sgemm: MxNxK == 196x864x208
 ReLU(name="relu_icp3_out1",in_place=[icp3_out1])
 # Branch "out2", step 1: 1x1 filters, 480 -> 16 channels, then in-place ReLU.
 icp3_reduction2_filts = NDA("icp3_reduction2_filts",16,480,1,1) # SOURCE out_chan,in_chan,y,x
 icp3_reduction2_biases = NDA("icp3_reduction2_biases",16) # SOURCE out_chan
 icp3_reduction2 = NDA("icp3_reduction2",num_img,16,14,14) # num,chan,y,x
 icp3_in_one_row_per_patch_buf = NDA("icp3_in_one_row_per_patch_buf",196,480)
 for i in range(0,num_img):
  patches_to_rows( src=icp3_in[i,:,:,:], dest=icp3_in_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  icp3_reduction2 = icp3_in_one_row_per_patch_buf * transpose(reshape(icp3_reduction2_filts,16,480)) # sgemm: MxNxK == 196x480x16
 ReLU(name="relu_icp3_reduction2",in_place=[icp3_reduction2])
 # Branch "out2", step 2: 5x5 filters, 16 -> 48 channels, in_pad 2 (400 = 16*5*5 columns).
 icp3_out2_filts = NDA("icp3_out2_filts",48,16,5,5) # SOURCE out_chan,in_chan,y,x
 icp3_out2_biases = NDA("icp3_out2_biases",48) # SOURCE out_chan
 icp3_out2 = NDA("icp3_out2",num_img,48,14,14) # num,chan,y,x
 icp3_reduction2_one_row_per_patch_buf = NDA("icp3_reduction2_one_row_per_patch_buf",196,400)
 for i in range(0,num_img):
  patches_to_rows( src=icp3_reduction2[i,:,:,:], dest=icp3_reduction2_one_row_per_patch_buf, in_pad="2 2 2 2",stride="1 1" ) # one copy per output elem
  icp3_out2 = icp3_reduction2_one_row_per_patch_buf * transpose(reshape(icp3_out2_filts,48,400)) # sgemm: MxNxK == 196x400x48
 ReLU(name="relu_icp3_out2",in_place=[icp3_out2])
 # Branch "out3", step 1: stride-1 pooling of icp3_in; declared shape is unchanged (480x14x14).
 icp3_pool = NDA("icp3_pool",num_img,480,14,14) # num,chan,y,x
 Pooling(name="icp3_pool",bots=[ icp3_in ],tops=[ icp3_pool ],
 	in_pad="1 1 1 1",stride="1 1")
 # Branch "out3", step 2: 1x1 filters on the pooled map, 480 -> 64 channels, then in-place ReLU.
 icp3_out3_filts = NDA("icp3_out3_filts",64,480,1,1) # SOURCE out_chan,in_chan,y,x
 icp3_out3_biases = NDA("icp3_out3_biases",64) # SOURCE out_chan
 icp3_out3 = NDA("icp3_out3",num_img,64,14,14) # num,chan,y,x
 icp3_pool_one_row_per_patch_buf = NDA("icp3_pool_one_row_per_patch_buf",196,480)
 for i in range(0,num_img):
  patches_to_rows( src=icp3_pool[i,:,:,:], dest=icp3_pool_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  icp3_out3 = icp3_pool_one_row_per_patch_buf * transpose(reshape(icp3_out3_filts,64,480)) # sgemm: MxNxK == 196x480x64
 ReLU(name="relu_icp3_out3",in_place=[icp3_out3])
 # Branch "out0": 1x1 filters directly on icp3_in, 480 -> 192 channels, then in-place ReLU.
 icp3_out0_filts = NDA("icp3_out0_filts",192,480,1,1) # SOURCE out_chan,in_chan,y,x
 icp3_out0_biases = NDA("icp3_out0_biases",192) # SOURCE out_chan
 icp3_out0 = NDA("icp3_out0",num_img,192,14,14) # num,chan,y,x
 icp3_in_one_row_per_patch_buf = NDA("icp3_in_one_row_per_patch_buf",196,480)
 for i in range(0,num_img):
  patches_to_rows( src=icp3_in[i,:,:,:], dest=icp3_in_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  icp3_out0 = icp3_in_one_row_per_patch_buf * transpose(reshape(icp3_out0_filts,192,480)) # sgemm: MxNxK == 196x480x192
 ReLU(name="relu_icp3_out0",in_place=[icp3_out0])
 # Concatenate the four branch outputs: 192 + 208 + 48 + 64 = 512 channels.
 icp3_out = NDA("icp3_out",num_img,512,14,14) # num,chan,y,x
 Concat(name="icp3_out",bots=[ icp3_out0, icp3_out1, icp3_out2, icp3_out3 ],tops=[ icp3_out ],
 	in_pad="0 0 0 0",stride="1 1")
 # cls1 head (sgemm form): a classifier branch fed by icp3_out whose final
 # output (cls1_fc2_) is marked SINK. The convolution and both fully connected
 # layers are written as a per-image patches_to_rows() expansion followed by a
 # matrix product with the reshaped, transposed filter tensor.
 # NOTE(review): the *_biases arrays are declared but not referenced by any
 # statement in this block.
 #
 # Stride-3 pooling shrinks the 512x14x14 map to 512x4x4.
 # NOTE(review): the pooling window size and pooling type are not visible in this call.
 cls1_pool = NDA("cls1_pool",num_img,512,4,4) # num,chan,y,x
 Pooling(name="cls1_pool",bots=[ icp3_out ],tops=[ cls1_pool ],
 	in_pad="0 0 0 0",stride="3 3")
 # 1x1 filters, 512 -> 128 channels; patch buffer is 16 = 4*4 rows by 512 columns.
 cls1_reduction_filts = NDA("cls1_reduction_filts",128,512,1,1) # SOURCE out_chan,in_chan,y,x
 cls1_reduction_biases = NDA("cls1_reduction_biases",128) # SOURCE out_chan
 cls1_reduction = NDA("cls1_reduction",num_img,128,4,4) # num,chan,y,x
 cls1_pool_one_row_per_patch_buf = NDA("cls1_pool_one_row_per_patch_buf",16,512)
 for i in range(0,num_img):
  patches_to_rows( src=cls1_pool[i,:,:,:], dest=cls1_pool_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  cls1_reduction = cls1_pool_one_row_per_patch_buf * transpose(reshape(cls1_reduction_filts,128,512)) # sgemm: MxNxK == 16x512x128
 ReLU(name="relu_cls1_reduction",in_place=[cls1_reduction])
 # First fully connected layer: the 4x4 filters span the whole 128x4x4 input,
 # so the patch buffer is a single row of 2048 = 128*4*4 values; 1024 outputs.
 cls1_fc1_filts = NDA("cls1_fc1_filts",1024,128,4,4) # SOURCE out_chan,in_chan,y,x
 cls1_fc1_biases = NDA("cls1_fc1_biases",1024) # SOURCE out_chan
 cls1_fc1 = NDA("cls1_fc1",num_img,1024,1,1) # num,chan,y,x
 cls1_reduction_one_row_per_patch_buf = NDA("cls1_reduction_one_row_per_patch_buf",1,2048)
 for i in range(0,num_img):
  patches_to_rows( src=cls1_reduction[i,:,:,:], dest=cls1_reduction_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  cls1_fc1 = cls1_reduction_one_row_per_patch_buf * transpose(reshape(cls1_fc1_filts,1024,2048)) # sgemm: MxNxK == 1x2048x1024
 # ReLU and Dropout are both applied in place on cls1_fc1.
 ReLU(name="relu_cls1_fc1",in_place=[cls1_fc1])
 Dropout(name="cls1_drop",in_place=[cls1_fc1])
 # Second fully connected layer: 1024 -> 33 outputs; single-row buffer of 1024 values.
 cls1_fc2__filts = NDA("cls1_fc2__filts",33,1024,1,1) # SOURCE out_chan,in_chan,y,x
 cls1_fc2__biases = NDA("cls1_fc2__biases",33) # SOURCE out_chan
 cls1_fc2_ = NDA("cls1_fc2_",num_img,33,1,1) # SINK num,chan,y,x
 cls1_fc1_one_row_per_patch_buf = NDA("cls1_fc1_one_row_per_patch_buf",1,1024)
 for i in range(0,num_img):
  patches_to_rows( src=cls1_fc1[i,:,:,:], dest=cls1_fc1_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  cls1_fc2_ = cls1_fc1_one_row_per_patch_buf * transpose(reshape(cls1_fc2__filts,33,1024)) # sgemm: MxNxK == 1x1024x33
 # icp4 module: four parallel branches over icp3_out (512 chan, 14x14), concatenated
 # along channels into icp4_out (160 + 224 + 64 + 64 = 512).
 #   out1: 1x1 reduce (112) -> 3x3 conv (224)     out2: 1x1 reduce (24) -> 5x5 conv (64)
 #   out3: pool -> 1x1 conv (64)                  out0: 1x1 conv (160)
 # Each conv is lowered to patches_to_rows into a per-image buffer followed by one matrix product.
 # NOTE(review): the "MxNxK" trailers list buffer-rows x buffer-cols x out_chan, i.e. the
 # shared inner dimension is the middle number -- confirm against the sgemm convention in use.
 # NOTE(review): each loop body rebinds the whole output name instead of indexing image i,
 # and the *_biases arrays are declared but never referenced here -- presumably pseudo-code; confirm.
 # NOTE(review): icp3_out_one_row_per_patch_buf is re-declared and refilled with identical
 # arguments for reduction1, reduction2 and out0.
 # Branch 1, step 1: 1x1 reduction 512 -> 112.
 icp4_reduction1_filts = NDA("icp4_reduction1_filts",112,512,1,1) # SOURCE out_chan,in_chan,y,x
 icp4_reduction1_biases = NDA("icp4_reduction1_biases",112) # SOURCE out_chan
 icp4_reduction1 = NDA("icp4_reduction1",num_img,112,14,14) # num,chan,y,x
 icp3_out_one_row_per_patch_buf = NDA("icp3_out_one_row_per_patch_buf",196,512)
 for i in range(0,num_img):
  patches_to_rows( src=icp3_out[i,:,:,:], dest=icp3_out_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  icp4_reduction1 = icp3_out_one_row_per_patch_buf * transpose(reshape(icp4_reduction1_filts,112,512)) # sgemm: MxNxK == 196x512x112
 ReLU(name="relu_icp4_reduction1",in_place=[icp4_reduction1])
 # Branch 1, step 2: 3x3 conv with pad 1, 112 -> 224 (patch length 112*3*3 = 1008).
 icp4_out1_filts = NDA("icp4_out1_filts",224,112,3,3) # SOURCE out_chan,in_chan,y,x
 icp4_out1_biases = NDA("icp4_out1_biases",224) # SOURCE out_chan
 icp4_out1 = NDA("icp4_out1",num_img,224,14,14) # num,chan,y,x
 icp4_reduction1_one_row_per_patch_buf = NDA("icp4_reduction1_one_row_per_patch_buf",196,1008)
 for i in range(0,num_img):
  patches_to_rows( src=icp4_reduction1[i,:,:,:], dest=icp4_reduction1_one_row_per_patch_buf, in_pad="1 1 1 1",stride="1 1" ) # one copy per output elem
  icp4_out1 = icp4_reduction1_one_row_per_patch_buf * transpose(reshape(icp4_out1_filts,224,1008)) # sgemm: MxNxK == 196x1008x224
 ReLU(name="relu_icp4_out1",in_place=[icp4_out1])
 # Branch 2, step 1: 1x1 reduction 512 -> 24.
 icp4_reduction2_filts = NDA("icp4_reduction2_filts",24,512,1,1) # SOURCE out_chan,in_chan,y,x
 icp4_reduction2_biases = NDA("icp4_reduction2_biases",24) # SOURCE out_chan
 icp4_reduction2 = NDA("icp4_reduction2",num_img,24,14,14) # num,chan,y,x
 icp3_out_one_row_per_patch_buf = NDA("icp3_out_one_row_per_patch_buf",196,512)
 for i in range(0,num_img):
  patches_to_rows( src=icp3_out[i,:,:,:], dest=icp3_out_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  icp4_reduction2 = icp3_out_one_row_per_patch_buf * transpose(reshape(icp4_reduction2_filts,24,512)) # sgemm: MxNxK == 196x512x24
 ReLU(name="relu_icp4_reduction2",in_place=[icp4_reduction2])
 # Branch 2, step 2: 5x5 conv with pad 2, 24 -> 64 (patch length 24*5*5 = 600).
 icp4_out2_filts = NDA("icp4_out2_filts",64,24,5,5) # SOURCE out_chan,in_chan,y,x
 icp4_out2_biases = NDA("icp4_out2_biases",64) # SOURCE out_chan
 icp4_out2 = NDA("icp4_out2",num_img,64,14,14) # num,chan,y,x
 icp4_reduction2_one_row_per_patch_buf = NDA("icp4_reduction2_one_row_per_patch_buf",196,600)
 for i in range(0,num_img):
  patches_to_rows( src=icp4_reduction2[i,:,:,:], dest=icp4_reduction2_one_row_per_patch_buf, in_pad="2 2 2 2",stride="1 1" ) # one copy per output elem
  icp4_out2 = icp4_reduction2_one_row_per_patch_buf * transpose(reshape(icp4_out2_filts,64,600)) # sgemm: MxNxK == 196x600x64
 ReLU(name="relu_icp4_out2",in_place=[icp4_out2])
 # Branch 3: stride-1 pooling with pad 1 (output stays 512 x 14x14), then 1x1 conv 512 -> 64.
 icp4_pool = NDA("icp4_pool",num_img,512,14,14) # num,chan,y,x
 Pooling(name="icp4_pool",bots=[ icp3_out ],tops=[ icp4_pool ],
 	in_pad="1 1 1 1",stride="1 1")
 icp4_out3_filts = NDA("icp4_out3_filts",64,512,1,1) # SOURCE out_chan,in_chan,y,x
 icp4_out3_biases = NDA("icp4_out3_biases",64) # SOURCE out_chan
 icp4_out3 = NDA("icp4_out3",num_img,64,14,14) # num,chan,y,x
 icp4_pool_one_row_per_patch_buf = NDA("icp4_pool_one_row_per_patch_buf",196,512)
 for i in range(0,num_img):
  patches_to_rows( src=icp4_pool[i,:,:,:], dest=icp4_pool_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  icp4_out3 = icp4_pool_one_row_per_patch_buf * transpose(reshape(icp4_out3_filts,64,512)) # sgemm: MxNxK == 196x512x64
 ReLU(name="relu_icp4_out3",in_place=[icp4_out3])
 # Branch 0: direct 1x1 conv 512 -> 160.
 icp4_out0_filts = NDA("icp4_out0_filts",160,512,1,1) # SOURCE out_chan,in_chan,y,x
 icp4_out0_biases = NDA("icp4_out0_biases",160) # SOURCE out_chan
 icp4_out0 = NDA("icp4_out0",num_img,160,14,14) # num,chan,y,x
 icp3_out_one_row_per_patch_buf = NDA("icp3_out_one_row_per_patch_buf",196,512)
 for i in range(0,num_img):
  patches_to_rows( src=icp3_out[i,:,:,:], dest=icp3_out_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  icp4_out0 = icp3_out_one_row_per_patch_buf * transpose(reshape(icp4_out0_filts,160,512)) # sgemm: MxNxK == 196x512x160
 ReLU(name="relu_icp4_out0",in_place=[icp4_out0])
 # Join the four branch outputs into the module output.
 icp4_out = NDA("icp4_out",num_img,512,14,14) # num,chan,y,x
 Concat(name="icp4_out",bots=[ icp4_out0, icp4_out1, icp4_out2, icp4_out3 ],tops=[ icp4_out ],
 	in_pad="0 0 0 0",stride="1 1")
 # icp5 module: four parallel branches over icp4_out (512 chan, 14x14), concatenated
 # along channels into icp5_out (128 + 256 + 64 + 64 = 512).
 #   out1: 1x1 reduce (128) -> 3x3 conv (256)     out2: 1x1 reduce (24) -> 5x5 conv (64)
 #   out3: pool -> 1x1 conv (64)                  out0: 1x1 conv (128)
 # Each conv is lowered to patches_to_rows into a per-image buffer followed by one matrix product.
 # NOTE(review): the "MxNxK" trailers list buffer-rows x buffer-cols x out_chan, i.e. the
 # shared inner dimension is the middle number -- confirm against the sgemm convention in use.
 # NOTE(review): each loop body rebinds the whole output name instead of indexing image i,
 # and the *_biases arrays are declared but never referenced here -- presumably pseudo-code; confirm.
 # NOTE(review): icp4_out_one_row_per_patch_buf is re-declared and refilled with identical
 # arguments for reduction1, reduction2 and out0.
 # Branch 1, step 1: 1x1 reduction 512 -> 128.
 icp5_reduction1_filts = NDA("icp5_reduction1_filts",128,512,1,1) # SOURCE out_chan,in_chan,y,x
 icp5_reduction1_biases = NDA("icp5_reduction1_biases",128) # SOURCE out_chan
 icp5_reduction1 = NDA("icp5_reduction1",num_img,128,14,14) # num,chan,y,x
 icp4_out_one_row_per_patch_buf = NDA("icp4_out_one_row_per_patch_buf",196,512)
 for i in range(0,num_img):
  patches_to_rows( src=icp4_out[i,:,:,:], dest=icp4_out_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  icp5_reduction1 = icp4_out_one_row_per_patch_buf * transpose(reshape(icp5_reduction1_filts,128,512)) # sgemm: MxNxK == 196x512x128
 ReLU(name="relu_icp5_reduction1",in_place=[icp5_reduction1])
 # Branch 1, step 2: 3x3 conv with pad 1, 128 -> 256 (patch length 128*3*3 = 1152).
 icp5_out1_filts = NDA("icp5_out1_filts",256,128,3,3) # SOURCE out_chan,in_chan,y,x
 icp5_out1_biases = NDA("icp5_out1_biases",256) # SOURCE out_chan
 icp5_out1 = NDA("icp5_out1",num_img,256,14,14) # num,chan,y,x
 icp5_reduction1_one_row_per_patch_buf = NDA("icp5_reduction1_one_row_per_patch_buf",196,1152)
 for i in range(0,num_img):
  patches_to_rows( src=icp5_reduction1[i,:,:,:], dest=icp5_reduction1_one_row_per_patch_buf, in_pad="1 1 1 1",stride="1 1" ) # one copy per output elem
  icp5_out1 = icp5_reduction1_one_row_per_patch_buf * transpose(reshape(icp5_out1_filts,256,1152)) # sgemm: MxNxK == 196x1152x256
 ReLU(name="relu_icp5_out1",in_place=[icp5_out1])
 # Branch 2, step 1: 1x1 reduction 512 -> 24.
 icp5_reduction2_filts = NDA("icp5_reduction2_filts",24,512,1,1) # SOURCE out_chan,in_chan,y,x
 icp5_reduction2_biases = NDA("icp5_reduction2_biases",24) # SOURCE out_chan
 icp5_reduction2 = NDA("icp5_reduction2",num_img,24,14,14) # num,chan,y,x
 icp4_out_one_row_per_patch_buf = NDA("icp4_out_one_row_per_patch_buf",196,512)
 for i in range(0,num_img):
  patches_to_rows( src=icp4_out[i,:,:,:], dest=icp4_out_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  icp5_reduction2 = icp4_out_one_row_per_patch_buf * transpose(reshape(icp5_reduction2_filts,24,512)) # sgemm: MxNxK == 196x512x24
 ReLU(name="relu_icp5_reduction2",in_place=[icp5_reduction2])
 # Branch 2, step 2: 5x5 conv with pad 2, 24 -> 64 (patch length 24*5*5 = 600).
 icp5_out2_filts = NDA("icp5_out2_filts",64,24,5,5) # SOURCE out_chan,in_chan,y,x
 icp5_out2_biases = NDA("icp5_out2_biases",64) # SOURCE out_chan
 icp5_out2 = NDA("icp5_out2",num_img,64,14,14) # num,chan,y,x
 icp5_reduction2_one_row_per_patch_buf = NDA("icp5_reduction2_one_row_per_patch_buf",196,600)
 for i in range(0,num_img):
  patches_to_rows( src=icp5_reduction2[i,:,:,:], dest=icp5_reduction2_one_row_per_patch_buf, in_pad="2 2 2 2",stride="1 1" ) # one copy per output elem
  icp5_out2 = icp5_reduction2_one_row_per_patch_buf * transpose(reshape(icp5_out2_filts,64,600)) # sgemm: MxNxK == 196x600x64
 ReLU(name="relu_icp5_out2",in_place=[icp5_out2])
 # Branch 3: stride-1 pooling with pad 1 (output stays 512 x 14x14), then 1x1 conv 512 -> 64.
 icp5_pool = NDA("icp5_pool",num_img,512,14,14) # num,chan,y,x
 Pooling(name="icp5_pool",bots=[ icp4_out ],tops=[ icp5_pool ],
 	in_pad="1 1 1 1",stride="1 1")
 icp5_out3_filts = NDA("icp5_out3_filts",64,512,1,1) # SOURCE out_chan,in_chan,y,x
 icp5_out3_biases = NDA("icp5_out3_biases",64) # SOURCE out_chan
 icp5_out3 = NDA("icp5_out3",num_img,64,14,14) # num,chan,y,x
 icp5_pool_one_row_per_patch_buf = NDA("icp5_pool_one_row_per_patch_buf",196,512)
 for i in range(0,num_img):
  patches_to_rows( src=icp5_pool[i,:,:,:], dest=icp5_pool_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  icp5_out3 = icp5_pool_one_row_per_patch_buf * transpose(reshape(icp5_out3_filts,64,512)) # sgemm: MxNxK == 196x512x64
 ReLU(name="relu_icp5_out3",in_place=[icp5_out3])
 # Branch 0: direct 1x1 conv 512 -> 128.
 icp5_out0_filts = NDA("icp5_out0_filts",128,512,1,1) # SOURCE out_chan,in_chan,y,x
 icp5_out0_biases = NDA("icp5_out0_biases",128) # SOURCE out_chan
 icp5_out0 = NDA("icp5_out0",num_img,128,14,14) # num,chan,y,x
 icp4_out_one_row_per_patch_buf = NDA("icp4_out_one_row_per_patch_buf",196,512)
 for i in range(0,num_img):
  patches_to_rows( src=icp4_out[i,:,:,:], dest=icp4_out_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  icp5_out0 = icp4_out_one_row_per_patch_buf * transpose(reshape(icp5_out0_filts,128,512)) # sgemm: MxNxK == 196x512x128
 ReLU(name="relu_icp5_out0",in_place=[icp5_out0])
 # Join the four branch outputs into the module output.
 icp5_out = NDA("icp5_out",num_img,512,14,14) # num,chan,y,x
 Concat(name="icp5_out",bots=[ icp5_out0, icp5_out1, icp5_out2, icp5_out3 ],tops=[ icp5_out ],
 	in_pad="0 0 0 0",stride="1 1")
 # icp6 module: four parallel branches over icp5_out (512 chan, 14x14), concatenated
 # along channels into icp6_out (112 + 288 + 64 + 64 = 528).
 #   out1: 1x1 reduce (144) -> 3x3 conv (288)     out2: 1x1 reduce (32) -> 5x5 conv (64)
 #   out3: pool -> 1x1 conv (64)                  out0: 1x1 conv (112)
 # Each conv is lowered to patches_to_rows into a per-image buffer followed by one matrix product.
 # NOTE(review): the "MxNxK" trailers list buffer-rows x buffer-cols x out_chan, i.e. the
 # shared inner dimension is the middle number -- confirm against the sgemm convention in use.
 # NOTE(review): each loop body rebinds the whole output name instead of indexing image i,
 # and the *_biases arrays are declared but never referenced here -- presumably pseudo-code; confirm.
 # NOTE(review): icp5_out_one_row_per_patch_buf is re-declared and refilled with identical
 # arguments for reduction1, reduction2 and out0.
 # Branch 1, step 1: 1x1 reduction 512 -> 144.
 icp6_reduction1_filts = NDA("icp6_reduction1_filts",144,512,1,1) # SOURCE out_chan,in_chan,y,x
 icp6_reduction1_biases = NDA("icp6_reduction1_biases",144) # SOURCE out_chan
 icp6_reduction1 = NDA("icp6_reduction1",num_img,144,14,14) # num,chan,y,x
 icp5_out_one_row_per_patch_buf = NDA("icp5_out_one_row_per_patch_buf",196,512)
 for i in range(0,num_img):
  patches_to_rows( src=icp5_out[i,:,:,:], dest=icp5_out_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  icp6_reduction1 = icp5_out_one_row_per_patch_buf * transpose(reshape(icp6_reduction1_filts,144,512)) # sgemm: MxNxK == 196x512x144
 ReLU(name="relu_icp6_reduction1",in_place=[icp6_reduction1])
 # Branch 1, step 2: 3x3 conv with pad 1, 144 -> 288 (patch length 144*3*3 = 1296).
 icp6_out1_filts = NDA("icp6_out1_filts",288,144,3,3) # SOURCE out_chan,in_chan,y,x
 icp6_out1_biases = NDA("icp6_out1_biases",288) # SOURCE out_chan
 icp6_out1 = NDA("icp6_out1",num_img,288,14,14) # num,chan,y,x
 icp6_reduction1_one_row_per_patch_buf = NDA("icp6_reduction1_one_row_per_patch_buf",196,1296)
 for i in range(0,num_img):
  patches_to_rows( src=icp6_reduction1[i,:,:,:], dest=icp6_reduction1_one_row_per_patch_buf, in_pad="1 1 1 1",stride="1 1" ) # one copy per output elem
  icp6_out1 = icp6_reduction1_one_row_per_patch_buf * transpose(reshape(icp6_out1_filts,288,1296)) # sgemm: MxNxK == 196x1296x288
 ReLU(name="relu_icp6_out1",in_place=[icp6_out1])
 # Branch 2, step 1: 1x1 reduction 512 -> 32.
 icp6_reduction2_filts = NDA("icp6_reduction2_filts",32,512,1,1) # SOURCE out_chan,in_chan,y,x
 icp6_reduction2_biases = NDA("icp6_reduction2_biases",32) # SOURCE out_chan
 icp6_reduction2 = NDA("icp6_reduction2",num_img,32,14,14) # num,chan,y,x
 icp5_out_one_row_per_patch_buf = NDA("icp5_out_one_row_per_patch_buf",196,512)
 for i in range(0,num_img):
  patches_to_rows( src=icp5_out[i,:,:,:], dest=icp5_out_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  icp6_reduction2 = icp5_out_one_row_per_patch_buf * transpose(reshape(icp6_reduction2_filts,32,512)) # sgemm: MxNxK == 196x512x32
 ReLU(name="relu_icp6_reduction2",in_place=[icp6_reduction2])
 # Branch 2, step 2: 5x5 conv with pad 2, 32 -> 64 (patch length 32*5*5 = 800).
 icp6_out2_filts = NDA("icp6_out2_filts",64,32,5,5) # SOURCE out_chan,in_chan,y,x
 icp6_out2_biases = NDA("icp6_out2_biases",64) # SOURCE out_chan
 icp6_out2 = NDA("icp6_out2",num_img,64,14,14) # num,chan,y,x
 icp6_reduction2_one_row_per_patch_buf = NDA("icp6_reduction2_one_row_per_patch_buf",196,800)
 for i in range(0,num_img):
  patches_to_rows( src=icp6_reduction2[i,:,:,:], dest=icp6_reduction2_one_row_per_patch_buf, in_pad="2 2 2 2",stride="1 1" ) # one copy per output elem
  icp6_out2 = icp6_reduction2_one_row_per_patch_buf * transpose(reshape(icp6_out2_filts,64,800)) # sgemm: MxNxK == 196x800x64
 ReLU(name="relu_icp6_out2",in_place=[icp6_out2])
 # Branch 3: stride-1 pooling with pad 1 (output stays 512 x 14x14), then 1x1 conv 512 -> 64.
 icp6_pool = NDA("icp6_pool",num_img,512,14,14) # num,chan,y,x
 Pooling(name="icp6_pool",bots=[ icp5_out ],tops=[ icp6_pool ],
 	in_pad="1 1 1 1",stride="1 1")
 icp6_out3_filts = NDA("icp6_out3_filts",64,512,1,1) # SOURCE out_chan,in_chan,y,x
 icp6_out3_biases = NDA("icp6_out3_biases",64) # SOURCE out_chan
 icp6_out3 = NDA("icp6_out3",num_img,64,14,14) # num,chan,y,x
 icp6_pool_one_row_per_patch_buf = NDA("icp6_pool_one_row_per_patch_buf",196,512)
 for i in range(0,num_img):
  patches_to_rows( src=icp6_pool[i,:,:,:], dest=icp6_pool_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  icp6_out3 = icp6_pool_one_row_per_patch_buf * transpose(reshape(icp6_out3_filts,64,512)) # sgemm: MxNxK == 196x512x64
 ReLU(name="relu_icp6_out3",in_place=[icp6_out3])
 # Branch 0: direct 1x1 conv 512 -> 112.
 icp6_out0_filts = NDA("icp6_out0_filts",112,512,1,1) # SOURCE out_chan,in_chan,y,x
 icp6_out0_biases = NDA("icp6_out0_biases",112) # SOURCE out_chan
 icp6_out0 = NDA("icp6_out0",num_img,112,14,14) # num,chan,y,x
 icp5_out_one_row_per_patch_buf = NDA("icp5_out_one_row_per_patch_buf",196,512)
 for i in range(0,num_img):
  patches_to_rows( src=icp5_out[i,:,:,:], dest=icp5_out_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  icp6_out0 = icp5_out_one_row_per_patch_buf * transpose(reshape(icp6_out0_filts,112,512)) # sgemm: MxNxK == 196x512x112
 ReLU(name="relu_icp6_out0",in_place=[icp6_out0])
 # Join the four branch outputs into the module output (channel count grows to 528).
 icp6_out = NDA("icp6_out",num_img,528,14,14) # num,chan,y,x
 Concat(name="icp6_out",bots=[ icp6_out0, icp6_out1, icp6_out2, icp6_out3 ],tops=[ icp6_out ],
 	in_pad="0 0 0 0",stride="1 1")
 # cls2 head, branching off icp6_out:
 #   pool (stride 3, 14x14 -> 4x4) -> 1x1 conv (128) -> ReLU
 #   -> "fc1" as a 4x4 conv over the whole 4x4 map (1024) -> ReLU -> Dropout
 #   -> "fc2" as a 1x1 conv (33 outputs, marked SINK).
 # Each conv is lowered to patches_to_rows into a per-image buffer followed by one matrix product.
 # NOTE(review): the "MxNxK" trailers list buffer-rows x buffer-cols x out_chan, i.e. the
 # shared inner dimension is the middle number -- confirm against the sgemm convention in use.
 # NOTE(review): each loop body rebinds the whole output name instead of indexing image i,
 # and the *_biases arrays are declared but never referenced here -- presumably pseudo-code; confirm.
 cls2_pool = NDA("cls2_pool",num_img,528,4,4) # num,chan,y,x
 Pooling(name="cls2_pool",bots=[ icp6_out ],tops=[ cls2_pool ],
 	in_pad="0 0 0 0",stride="3 3")
 # 1x1 reduction conv: 528 -> 128 channels on the 4x4 map (16 patches of 528 values).
 cls2_reduction_filts = NDA("cls2_reduction_filts",128,528,1,1) # SOURCE out_chan,in_chan,y,x
 cls2_reduction_biases = NDA("cls2_reduction_biases",128) # SOURCE out_chan
 cls2_reduction = NDA("cls2_reduction",num_img,128,4,4) # num,chan,y,x
 cls2_pool_one_row_per_patch_buf = NDA("cls2_pool_one_row_per_patch_buf",16,528)
 for i in range(0,num_img):
  patches_to_rows( src=cls2_pool[i,:,:,:], dest=cls2_pool_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  cls2_reduction = cls2_pool_one_row_per_patch_buf * transpose(reshape(cls2_reduction_filts,128,528)) # sgemm: MxNxK == 16x528x128
 ReLU(name="relu_cls2_reduction",in_place=[cls2_reduction])
 # fc1: the 4x4 filter covers the entire 4x4 input, so there is a single patch of 128*4*4 = 2048 values.
 cls2_fc1_filts = NDA("cls2_fc1_filts",1024,128,4,4) # SOURCE out_chan,in_chan,y,x
 cls2_fc1_biases = NDA("cls2_fc1_biases",1024) # SOURCE out_chan
 cls2_fc1 = NDA("cls2_fc1",num_img,1024,1,1) # num,chan,y,x
 cls2_reduction_one_row_per_patch_buf = NDA("cls2_reduction_one_row_per_patch_buf",1,2048)
 for i in range(0,num_img):
  patches_to_rows( src=cls2_reduction[i,:,:,:], dest=cls2_reduction_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  cls2_fc1 = cls2_reduction_one_row_per_patch_buf * transpose(reshape(cls2_fc1_filts,1024,2048)) # sgemm: MxNxK == 1x2048x1024
 ReLU(name="relu_cls2_fc1",in_place=[cls2_fc1])
 Dropout(name="cls2_drop",in_place=[cls2_fc1])
 # fc2: 1024 -> 33 outputs; cls2_fc2_ is one of the network's SINK arrays.
 cls2_fc2__filts = NDA("cls2_fc2__filts",33,1024,1,1) # SOURCE out_chan,in_chan,y,x
 cls2_fc2__biases = NDA("cls2_fc2__biases",33) # SOURCE out_chan
 cls2_fc2_ = NDA("cls2_fc2_",num_img,33,1,1) # SINK num,chan,y,x
 cls2_fc1_one_row_per_patch_buf = NDA("cls2_fc1_one_row_per_patch_buf",1,1024)
 for i in range(0,num_img):
  patches_to_rows( src=cls2_fc1[i,:,:,:], dest=cls2_fc1_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  cls2_fc2_ = cls2_fc1_one_row_per_patch_buf * transpose(reshape(cls2_fc2__filts,33,1024)) # sgemm: MxNxK == 1x1024x33
 # icp7 module: four parallel branches over icp6_out (528 chan, 14x14), concatenated
 # along channels into icp7_out (256 + 320 + 128 + 128 = 832).
 #   out1: 1x1 reduce (160) -> 3x3 conv (320)     out2: 1x1 reduce (32) -> 5x5 conv (128)
 #   out3: pool -> 1x1 conv (128)                 out0: 1x1 conv (256)
 # Each conv is lowered to patches_to_rows into a per-image buffer followed by one matrix product.
 # NOTE(review): the "MxNxK" trailers list buffer-rows x buffer-cols x out_chan, i.e. the
 # shared inner dimension is the middle number -- confirm against the sgemm convention in use.
 # NOTE(review): each loop body rebinds the whole output name instead of indexing image i,
 # and the *_biases arrays are declared but never referenced here -- presumably pseudo-code; confirm.
 # NOTE(review): icp6_out_one_row_per_patch_buf is re-declared and refilled with identical
 # arguments for reduction1, reduction2 and out0.
 # Branch 1, step 1: 1x1 reduction 528 -> 160.
 icp7_reduction1_filts = NDA("icp7_reduction1_filts",160,528,1,1) # SOURCE out_chan,in_chan,y,x
 icp7_reduction1_biases = NDA("icp7_reduction1_biases",160) # SOURCE out_chan
 icp7_reduction1 = NDA("icp7_reduction1",num_img,160,14,14) # num,chan,y,x
 icp6_out_one_row_per_patch_buf = NDA("icp6_out_one_row_per_patch_buf",196,528)
 for i in range(0,num_img):
  patches_to_rows( src=icp6_out[i,:,:,:], dest=icp6_out_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  icp7_reduction1 = icp6_out_one_row_per_patch_buf * transpose(reshape(icp7_reduction1_filts,160,528)) # sgemm: MxNxK == 196x528x160
 ReLU(name="relu_icp7_reduction1",in_place=[icp7_reduction1])
 # Branch 1, step 2: 3x3 conv with pad 1, 160 -> 320 (patch length 160*3*3 = 1440).
 icp7_out1_filts = NDA("icp7_out1_filts",320,160,3,3) # SOURCE out_chan,in_chan,y,x
 icp7_out1_biases = NDA("icp7_out1_biases",320) # SOURCE out_chan
 icp7_out1 = NDA("icp7_out1",num_img,320,14,14) # num,chan,y,x
 icp7_reduction1_one_row_per_patch_buf = NDA("icp7_reduction1_one_row_per_patch_buf",196,1440)
 for i in range(0,num_img):
  patches_to_rows( src=icp7_reduction1[i,:,:,:], dest=icp7_reduction1_one_row_per_patch_buf, in_pad="1 1 1 1",stride="1 1" ) # one copy per output elem
  icp7_out1 = icp7_reduction1_one_row_per_patch_buf * transpose(reshape(icp7_out1_filts,320,1440)) # sgemm: MxNxK == 196x1440x320
 ReLU(name="relu_icp7_out1",in_place=[icp7_out1])
 # Branch 2, step 1: 1x1 reduction 528 -> 32.
 icp7_reduction2_filts = NDA("icp7_reduction2_filts",32,528,1,1) # SOURCE out_chan,in_chan,y,x
 icp7_reduction2_biases = NDA("icp7_reduction2_biases",32) # SOURCE out_chan
 icp7_reduction2 = NDA("icp7_reduction2",num_img,32,14,14) # num,chan,y,x
 icp6_out_one_row_per_patch_buf = NDA("icp6_out_one_row_per_patch_buf",196,528)
 for i in range(0,num_img):
  patches_to_rows( src=icp6_out[i,:,:,:], dest=icp6_out_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  icp7_reduction2 = icp6_out_one_row_per_patch_buf * transpose(reshape(icp7_reduction2_filts,32,528)) # sgemm: MxNxK == 196x528x32
 ReLU(name="relu_icp7_reduction2",in_place=[icp7_reduction2])
 # Branch 2, step 2: 5x5 conv with pad 2, 32 -> 128 (patch length 32*5*5 = 800).
 icp7_out2_filts = NDA("icp7_out2_filts",128,32,5,5) # SOURCE out_chan,in_chan,y,x
 icp7_out2_biases = NDA("icp7_out2_biases",128) # SOURCE out_chan
 icp7_out2 = NDA("icp7_out2",num_img,128,14,14) # num,chan,y,x
 icp7_reduction2_one_row_per_patch_buf = NDA("icp7_reduction2_one_row_per_patch_buf",196,800)
 for i in range(0,num_img):
  patches_to_rows( src=icp7_reduction2[i,:,:,:], dest=icp7_reduction2_one_row_per_patch_buf, in_pad="2 2 2 2",stride="1 1" ) # one copy per output elem
  icp7_out2 = icp7_reduction2_one_row_per_patch_buf * transpose(reshape(icp7_out2_filts,128,800)) # sgemm: MxNxK == 196x800x128
 ReLU(name="relu_icp7_out2",in_place=[icp7_out2])
 # Branch 3: stride-1 pooling with pad 1 (output stays 528 x 14x14), then 1x1 conv 528 -> 128.
 icp7_pool = NDA("icp7_pool",num_img,528,14,14) # num,chan,y,x
 Pooling(name="icp7_pool",bots=[ icp6_out ],tops=[ icp7_pool ],
 	in_pad="1 1 1 1",stride="1 1")
 icp7_out3_filts = NDA("icp7_out3_filts",128,528,1,1) # SOURCE out_chan,in_chan,y,x
 icp7_out3_biases = NDA("icp7_out3_biases",128) # SOURCE out_chan
 icp7_out3 = NDA("icp7_out3",num_img,128,14,14) # num,chan,y,x
 icp7_pool_one_row_per_patch_buf = NDA("icp7_pool_one_row_per_patch_buf",196,528)
 for i in range(0,num_img):
  patches_to_rows( src=icp7_pool[i,:,:,:], dest=icp7_pool_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  icp7_out3 = icp7_pool_one_row_per_patch_buf * transpose(reshape(icp7_out3_filts,128,528)) # sgemm: MxNxK == 196x528x128
 ReLU(name="relu_icp7_out3",in_place=[icp7_out3])
 # Branch 0: direct 1x1 conv 528 -> 256.
 icp7_out0_filts = NDA("icp7_out0_filts",256,528,1,1) # SOURCE out_chan,in_chan,y,x
 icp7_out0_biases = NDA("icp7_out0_biases",256) # SOURCE out_chan
 icp7_out0 = NDA("icp7_out0",num_img,256,14,14) # num,chan,y,x
 icp6_out_one_row_per_patch_buf = NDA("icp6_out_one_row_per_patch_buf",196,528)
 for i in range(0,num_img):
  patches_to_rows( src=icp6_out[i,:,:,:], dest=icp6_out_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  icp7_out0 = icp6_out_one_row_per_patch_buf * transpose(reshape(icp7_out0_filts,256,528)) # sgemm: MxNxK == 196x528x256
 ReLU(name="relu_icp7_out0",in_place=[icp7_out0])
 # Join the four branch outputs into the module output (channel count grows to 832).
 icp7_out = NDA("icp7_out",num_img,832,14,14) # num,chan,y,x
 Concat(name="icp7_out",bots=[ icp7_out0, icp7_out1, icp7_out2, icp7_out3 ],tops=[ icp7_out ],
 	in_pad="0 0 0 0",stride="1 1")
 # icp8 module. First a stride-2 pooling shrinks icp7_out from 14x14 to 7x7 (icp8_in, 832 chan);
 # then four parallel branches over icp8_in are concatenated along channels into
 # icp8_out (256 + 320 + 128 + 128 = 832).
 #   out1: 1x1 reduce (160) -> 3x3 conv (320)     out2: 1x1 reduce (32) -> 5x5 conv (128)
 #   out3: pool -> 1x1 conv (128)                 out0: 1x1 conv (256)
 # Each conv is lowered to patches_to_rows into a per-image buffer followed by one matrix product.
 # NOTE(review): the "MxNxK" trailers list buffer-rows x buffer-cols x out_chan, i.e. the
 # shared inner dimension is the middle number -- confirm against the sgemm convention in use.
 # NOTE(review): each loop body rebinds the whole output name instead of indexing image i,
 # and the *_biases arrays are declared but never referenced here -- presumably pseudo-code; confirm.
 # NOTE(review): icp8_in_one_row_per_patch_buf is re-declared and refilled with identical
 # arguments for reduction1, reduction2 and out0.
 icp8_in = NDA("icp8_in",num_img,832,7,7) # num,chan,y,x
 Pooling(name="icp8_in",bots=[ icp7_out ],tops=[ icp8_in ],
 	in_pad="0 0 0 0",stride="2 2")
 # Branch 1, step 1: 1x1 reduction 832 -> 160 (49 patches per image on the 7x7 map).
 icp8_reduction1_filts = NDA("icp8_reduction1_filts",160,832,1,1) # SOURCE out_chan,in_chan,y,x
 icp8_reduction1_biases = NDA("icp8_reduction1_biases",160) # SOURCE out_chan
 icp8_reduction1 = NDA("icp8_reduction1",num_img,160,7,7) # num,chan,y,x
 icp8_in_one_row_per_patch_buf = NDA("icp8_in_one_row_per_patch_buf",49,832)
 for i in range(0,num_img):
  patches_to_rows( src=icp8_in[i,:,:,:], dest=icp8_in_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  icp8_reduction1 = icp8_in_one_row_per_patch_buf * transpose(reshape(icp8_reduction1_filts,160,832)) # sgemm: MxNxK == 49x832x160
 ReLU(name="relu_icp8_reduction1",in_place=[icp8_reduction1])
 # Branch 1, step 2: 3x3 conv with pad 1, 160 -> 320 (patch length 160*3*3 = 1440).
 icp8_out1_filts = NDA("icp8_out1_filts",320,160,3,3) # SOURCE out_chan,in_chan,y,x
 icp8_out1_biases = NDA("icp8_out1_biases",320) # SOURCE out_chan
 icp8_out1 = NDA("icp8_out1",num_img,320,7,7) # num,chan,y,x
 icp8_reduction1_one_row_per_patch_buf = NDA("icp8_reduction1_one_row_per_patch_buf",49,1440)
 for i in range(0,num_img):
  patches_to_rows( src=icp8_reduction1[i,:,:,:], dest=icp8_reduction1_one_row_per_patch_buf, in_pad="1 1 1 1",stride="1 1" ) # one copy per output elem
  icp8_out1 = icp8_reduction1_one_row_per_patch_buf * transpose(reshape(icp8_out1_filts,320,1440)) # sgemm: MxNxK == 49x1440x320
 ReLU(name="relu_icp8_out1",in_place=[icp8_out1])
 # Branch 2, step 1: 1x1 reduction 832 -> 32.
 icp8_reduction2_filts = NDA("icp8_reduction2_filts",32,832,1,1) # SOURCE out_chan,in_chan,y,x
 icp8_reduction2_biases = NDA("icp8_reduction2_biases",32) # SOURCE out_chan
 icp8_reduction2 = NDA("icp8_reduction2",num_img,32,7,7) # num,chan,y,x
 icp8_in_one_row_per_patch_buf = NDA("icp8_in_one_row_per_patch_buf",49,832)
 for i in range(0,num_img):
  patches_to_rows( src=icp8_in[i,:,:,:], dest=icp8_in_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  icp8_reduction2 = icp8_in_one_row_per_patch_buf * transpose(reshape(icp8_reduction2_filts,32,832)) # sgemm: MxNxK == 49x832x32
 ReLU(name="relu_icp8_reduction2",in_place=[icp8_reduction2])
 # Branch 2, step 2: 5x5 conv with pad 2, 32 -> 128 (patch length 32*5*5 = 800).
 icp8_out2_filts = NDA("icp8_out2_filts",128,32,5,5) # SOURCE out_chan,in_chan,y,x
 icp8_out2_biases = NDA("icp8_out2_biases",128) # SOURCE out_chan
 icp8_out2 = NDA("icp8_out2",num_img,128,7,7) # num,chan,y,x
 icp8_reduction2_one_row_per_patch_buf = NDA("icp8_reduction2_one_row_per_patch_buf",49,800)
 for i in range(0,num_img):
  patches_to_rows( src=icp8_reduction2[i,:,:,:], dest=icp8_reduction2_one_row_per_patch_buf, in_pad="2 2 2 2",stride="1 1" ) # one copy per output elem
  icp8_out2 = icp8_reduction2_one_row_per_patch_buf * transpose(reshape(icp8_out2_filts,128,800)) # sgemm: MxNxK == 49x800x128
 ReLU(name="relu_icp8_out2",in_place=[icp8_out2])
 # Branch 3: stride-1 pooling with pad 1 (output stays 832 x 7x7), then 1x1 conv 832 -> 128.
 icp8_pool = NDA("icp8_pool",num_img,832,7,7) # num,chan,y,x
 Pooling(name="icp8_pool",bots=[ icp8_in ],tops=[ icp8_pool ],
 	in_pad="1 1 1 1",stride="1 1")
 icp8_out3_filts = NDA("icp8_out3_filts",128,832,1,1) # SOURCE out_chan,in_chan,y,x
 icp8_out3_biases = NDA("icp8_out3_biases",128) # SOURCE out_chan
 icp8_out3 = NDA("icp8_out3",num_img,128,7,7) # num,chan,y,x
 icp8_pool_one_row_per_patch_buf = NDA("icp8_pool_one_row_per_patch_buf",49,832)
 for i in range(0,num_img):
  patches_to_rows( src=icp8_pool[i,:,:,:], dest=icp8_pool_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  icp8_out3 = icp8_pool_one_row_per_patch_buf * transpose(reshape(icp8_out3_filts,128,832)) # sgemm: MxNxK == 49x832x128
 ReLU(name="relu_icp8_out3",in_place=[icp8_out3])
 # Branch 0: direct 1x1 conv 832 -> 256.
 icp8_out0_filts = NDA("icp8_out0_filts",256,832,1,1) # SOURCE out_chan,in_chan,y,x
 icp8_out0_biases = NDA("icp8_out0_biases",256) # SOURCE out_chan
 icp8_out0 = NDA("icp8_out0",num_img,256,7,7) # num,chan,y,x
 icp8_in_one_row_per_patch_buf = NDA("icp8_in_one_row_per_patch_buf",49,832)
 for i in range(0,num_img):
  patches_to_rows( src=icp8_in[i,:,:,:], dest=icp8_in_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  icp8_out0 = icp8_in_one_row_per_patch_buf * transpose(reshape(icp8_out0_filts,256,832)) # sgemm: MxNxK == 49x832x256
 ReLU(name="relu_icp8_out0",in_place=[icp8_out0])
 # Join the four branch outputs into the module output.
 icp8_out = NDA("icp8_out",num_img,832,7,7) # num,chan,y,x
 Concat(name="icp8_out",bots=[ icp8_out0, icp8_out1, icp8_out2, icp8_out3 ],tops=[ icp8_out ],
 	in_pad="0 0 0 0",stride="1 1")
 # icp9 module: four parallel branches over icp8_out (832 chan, 7x7), concatenated
 # along channels into icp9_out (384 + 384 + 128 + 128 = 1024).
 #   out1: 1x1 reduce (192) -> 3x3 conv (384)     out2: 1x1 reduce (48) -> 5x5 conv (128)
 #   out3: pool -> 1x1 conv (128)                 out0: 1x1 conv (384)
 # Each conv is lowered to patches_to_rows into a per-image buffer followed by one matrix product.
 # NOTE(review): the "MxNxK" trailers list buffer-rows x buffer-cols x out_chan, i.e. the
 # shared inner dimension is the middle number -- confirm against the sgemm convention in use.
 # NOTE(review): each loop body rebinds the whole output name instead of indexing image i,
 # and the *_biases arrays are declared but never referenced here -- presumably pseudo-code; confirm.
 # NOTE(review): icp8_out_one_row_per_patch_buf is re-declared and refilled with identical
 # arguments for reduction1, reduction2 and out0.
 # Branch 1, step 1: 1x1 reduction 832 -> 192.
 icp9_reduction1_filts = NDA("icp9_reduction1_filts",192,832,1,1) # SOURCE out_chan,in_chan,y,x
 icp9_reduction1_biases = NDA("icp9_reduction1_biases",192) # SOURCE out_chan
 icp9_reduction1 = NDA("icp9_reduction1",num_img,192,7,7) # num,chan,y,x
 icp8_out_one_row_per_patch_buf = NDA("icp8_out_one_row_per_patch_buf",49,832)
 for i in range(0,num_img):
  patches_to_rows( src=icp8_out[i,:,:,:], dest=icp8_out_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  icp9_reduction1 = icp8_out_one_row_per_patch_buf * transpose(reshape(icp9_reduction1_filts,192,832)) # sgemm: MxNxK == 49x832x192
 ReLU(name="relu_icp9_reduction1",in_place=[icp9_reduction1])
 # Branch 1, step 2: 3x3 conv with pad 1, 192 -> 384 (patch length 192*3*3 = 1728).
 icp9_out1_filts = NDA("icp9_out1_filts",384,192,3,3) # SOURCE out_chan,in_chan,y,x
 icp9_out1_biases = NDA("icp9_out1_biases",384) # SOURCE out_chan
 icp9_out1 = NDA("icp9_out1",num_img,384,7,7) # num,chan,y,x
 icp9_reduction1_one_row_per_patch_buf = NDA("icp9_reduction1_one_row_per_patch_buf",49,1728)
 for i in range(0,num_img):
  patches_to_rows( src=icp9_reduction1[i,:,:,:], dest=icp9_reduction1_one_row_per_patch_buf, in_pad="1 1 1 1",stride="1 1" ) # one copy per output elem
  icp9_out1 = icp9_reduction1_one_row_per_patch_buf * transpose(reshape(icp9_out1_filts,384,1728)) # sgemm: MxNxK == 49x1728x384
 ReLU(name="relu_icp9_out1",in_place=[icp9_out1])
 # Branch 2, step 1: 1x1 reduction 832 -> 48.
 icp9_reduction2_filts = NDA("icp9_reduction2_filts",48,832,1,1) # SOURCE out_chan,in_chan,y,x
 icp9_reduction2_biases = NDA("icp9_reduction2_biases",48) # SOURCE out_chan
 icp9_reduction2 = NDA("icp9_reduction2",num_img,48,7,7) # num,chan,y,x
 icp8_out_one_row_per_patch_buf = NDA("icp8_out_one_row_per_patch_buf",49,832)
 for i in range(0,num_img):
  patches_to_rows( src=icp8_out[i,:,:,:], dest=icp8_out_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  icp9_reduction2 = icp8_out_one_row_per_patch_buf * transpose(reshape(icp9_reduction2_filts,48,832)) # sgemm: MxNxK == 49x832x48
 ReLU(name="relu_icp9_reduction2",in_place=[icp9_reduction2])
 # Branch 2, step 2: 5x5 conv with pad 2, 48 -> 128 (patch length 48*5*5 = 1200).
 icp9_out2_filts = NDA("icp9_out2_filts",128,48,5,5) # SOURCE out_chan,in_chan,y,x
 icp9_out2_biases = NDA("icp9_out2_biases",128) # SOURCE out_chan
 icp9_out2 = NDA("icp9_out2",num_img,128,7,7) # num,chan,y,x
 icp9_reduction2_one_row_per_patch_buf = NDA("icp9_reduction2_one_row_per_patch_buf",49,1200)
 for i in range(0,num_img):
  patches_to_rows( src=icp9_reduction2[i,:,:,:], dest=icp9_reduction2_one_row_per_patch_buf, in_pad="2 2 2 2",stride="1 1" ) # one copy per output elem
  icp9_out2 = icp9_reduction2_one_row_per_patch_buf * transpose(reshape(icp9_out2_filts,128,1200)) # sgemm: MxNxK == 49x1200x128
 ReLU(name="relu_icp9_out2",in_place=[icp9_out2])
 # Branch 3: stride-1 pooling with pad 1 (output stays 832 x 7x7), then 1x1 conv 832 -> 128.
 icp9_pool = NDA("icp9_pool",num_img,832,7,7) # num,chan,y,x
 Pooling(name="icp9_pool",bots=[ icp8_out ],tops=[ icp9_pool ],
 	in_pad="1 1 1 1",stride="1 1")
 icp9_out3_filts = NDA("icp9_out3_filts",128,832,1,1) # SOURCE out_chan,in_chan,y,x
 icp9_out3_biases = NDA("icp9_out3_biases",128) # SOURCE out_chan
 icp9_out3 = NDA("icp9_out3",num_img,128,7,7) # num,chan,y,x
 icp9_pool_one_row_per_patch_buf = NDA("icp9_pool_one_row_per_patch_buf",49,832)
 for i in range(0,num_img):
  patches_to_rows( src=icp9_pool[i,:,:,:], dest=icp9_pool_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  icp9_out3 = icp9_pool_one_row_per_patch_buf * transpose(reshape(icp9_out3_filts,128,832)) # sgemm: MxNxK == 49x832x128
 ReLU(name="relu_icp9_out3",in_place=[icp9_out3])
 # Branch 0: direct 1x1 conv 832 -> 384.
 icp9_out0_filts = NDA("icp9_out0_filts",384,832,1,1) # SOURCE out_chan,in_chan,y,x
 icp9_out0_biases = NDA("icp9_out0_biases",384) # SOURCE out_chan
 icp9_out0 = NDA("icp9_out0",num_img,384,7,7) # num,chan,y,x
 icp8_out_one_row_per_patch_buf = NDA("icp8_out_one_row_per_patch_buf",49,832)
 for i in range(0,num_img):
  patches_to_rows( src=icp8_out[i,:,:,:], dest=icp8_out_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  icp9_out0 = icp8_out_one_row_per_patch_buf * transpose(reshape(icp9_out0_filts,384,832)) # sgemm: MxNxK == 49x832x384
 ReLU(name="relu_icp9_out0",in_place=[icp9_out0])
 # Join the four branch outputs into the module output (channel count grows to 1024).
 icp9_out = NDA("icp9_out",num_img,1024,7,7) # num,chan,y,x
 Concat(name="icp9_out",bots=[ icp9_out0, icp9_out1, icp9_out2, icp9_out3 ],tops=[ icp9_out ],
 	in_pad="0 0 0 0",stride="1 1")
 # cls3 head on icp9_out: pooling collapses the 7x7 map to 1x1 (1024 chan), then
 # Dropout, then a 1x1 conv "fc" producing 33 outputs (cls3_fc_, marked SINK).
 # NOTE(review): the pooling window size is not shown here; the 7x7 -> 1x1 shapes
 # suggest a window covering the whole map -- confirm.
 # NOTE(review): the loop body rebinds the whole of cls3_fc_ instead of indexing image i,
 # and cls3_fc__biases is declared but never referenced here -- presumably pseudo-code; confirm.
 cls3_pool = NDA("cls3_pool",num_img,1024,1,1) # num,chan,y,x
 Pooling(name="cls3_pool",bots=[ icp9_out ],tops=[ cls3_pool ],
 	in_pad="0 0 0 0",stride="1 1")
 Dropout(name="cls3_drop",in_place=[cls3_pool])
 # Final classifier: single 1024-value patch per image times a 1024x33 matrix.
 cls3_fc__filts = NDA("cls3_fc__filts",33,1024,1,1) # SOURCE out_chan,in_chan,y,x
 cls3_fc__biases = NDA("cls3_fc__biases",33) # SOURCE out_chan
 cls3_fc_ = NDA("cls3_fc_",num_img,33,1,1) # SINK num,chan,y,x
 cls3_pool_one_row_per_patch_buf = NDA("cls3_pool_one_row_per_patch_buf",1,1024)
 for i in range(0,num_img):
  patches_to_rows( src=cls3_pool[i,:,:,:], dest=cls3_pool_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
  cls3_fc_ = cls3_pool_one_row_per_patch_buf * transpose(reshape(cls3_fc__filts,33,1024)) # sgemm: MxNxK == 1x1024x33
diff --git a/googlenet_ft_fl_train_val.prototxt b/googlenet_ft_fl_train_val.prototxt
 name: "GoogLeNet"
 # Training-phase input layer: reads 32-image batches from an LMDB, crops to 224,
 # applies the ImageNet mean file, with mirroring enabled. Emits "data" and "label".
 layers {
  type: DATA
  name: "data"
  include { phase: TRAIN }
  top: "data"
  top: "label"
  transform_param {
    mirror: true
    crop_size: 224
    mean_file: "/scratch/datasets/data/ilsvrc12/imagenet_mean.binaryproto"
    # Mean files used in other setups:
    #mean_file: "network5_mean.binaryproto"
    #mean_file: "/home/eecs/forresti/caffe_depthMax_and_hog/data/ilsvrc12/imagenet_mean.binaryproto"
  }
  data_param {
    backend: LMDB
    batch_size: 32
    source: "/scratch/datasets/imagenet_classification/ilsvrc12_train_lmdb"
    # Training databases used in other setups:
    #source: "train_lmdb"
    #source: "/scratch/forresti/ilsvrc2012/ilsvrc2012_train_256x256_lmdb"
    #source: "/nscratch/forresti/FlickrLogos-32/FlickrLogos-32_trainval_lmdb"
    #source: "/nscratch/forresti/FlickrLogos-32/FlickrLogos-32_trainval_logosonly_lmdb"
  }
 }
 # Test-phase input layer: reads 25-image batches from an LMDB, crops to 224,
 # applies the ImageNet mean file, mirroring disabled. Emits "data" and "label".
 layers {
  name: "data"
  type: DATA
  top: "data"
  top: "label"
  data_param {
    #source: "test_lmdb"
    #source: "/scratch/forresti/ilsvrc2012/ilsvrc2012_val_256x256_lmdb"
    #source: "/nscratch/forresti/FlickrLogos-32/FlickrLogos-32_test_lmdb"
    #source: "/nscratch/forresti/FlickrLogos-32/FlickrLogos-32_test_logosonly_lmdb"
    # The TEST phase previously read ilsvrc12_train_lmdb, i.e. test accuracy was
    # measured on training images. Point it at the held-out validation database.
    # NOTE(review): assumes the validation LMDB sits next to the training one under
    # the name produced by Caffe's create_imagenet.sh -- confirm the path exists.
    #source: "/scratch/datasets/imagenet_classification/ilsvrc12_train_lmdb"
    source: "/scratch/datasets/imagenet_classification/ilsvrc12_val_lmdb"
    backend: LMDB
    #batch_size: 32
    batch_size: 25
  }
  transform_param {
    crop_size: 224
    #mean_file: "network5_mean.binaryproto"
    #mean_file: "/home/eecs/forresti/caffe_depthMax_and_hog/data/ilsvrc12/imagenet_mean.binaryproto"
    mean_file: "/scratch/datasets/data/ilsvrc12/imagenet_mean.binaryproto"
    mirror: false
  }
  include: { phase: TEST }
 }
 # conv1: 7x7 convolution over "data", stride 2, pad 3, 64 outputs.
 # Weights start Gaussian (std 0.015); biases start at the constant 0.
 layers {
  type: CONVOLUTION
  name: "conv1"
  bottom: "data"
  top: "conv1"
  # Repeated per-blob settings; the order of the entries is significant.
  weight_decay: 1
  weight_decay: 0
  blobs_lr: 1
  blobs_lr: 2
  convolution_param {
    pad: 3
    stride: 2
    kernel_size: 7
    num_output: 64
    bias_filler { type: "constant" value: 0 }
    weight_filler { type: "gaussian" std: 0.015 }
  }
 }
 # relu1: ReLU applied in place (bottom and top are both "conv1").
 layers {
  type: RELU
  name: "relu1"
  top: "conv1"
  bottom: "conv1"
 }
 # pool1: 3x3 max pooling of "conv1", stride 2, no padding.
 layers {
  type: POOLING
  name: "pool1"
  bottom: "conv1"
  top: "pool1"
  pooling_param { pool: MAX kernel_size: 3 stride: 2 pad: 0 }
 }
 # norm1: local response normalization of "pool1" (local_size 5, alpha 1e-4, beta 0.75).
 layers {
  type: LRN
  name: "norm1"
  bottom: "pool1"
  top: "norm1"
  lrn_param { local_size: 5 alpha: 0.0001 beta: 0.75 }
 }
 # reduction2: 1x1 convolution over "norm1", no padding, 64 outputs, a single group.
 # Weights use the "xavier" filler; biases start at the constant 0.
 layers {
  type: CONVOLUTION
  name: "reduction2"
  bottom: "norm1"
  top: "reduction2"
  # Repeated per-blob settings; the order of the entries is significant.
  weight_decay: 1
  weight_decay: 0
  blobs_lr: 1
  blobs_lr: 2
  convolution_param {
    group: 1
    kernel_size: 1
    pad: 0
    num_output: 64
    bias_filler { type: "constant" value: 0 }
    weight_filler { type: "xavier" }
  }
 }
 layers {
  name: "relu_reduction2"
  type: RELU
  bottom: "reduction2"
  top: "reduction2"
 }
 layers {
  name: "conv2"
  type: CONVOLUTION
  bottom: "reduction2"
  top: "conv2"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 192
    pad: 1
    kernel_size: 3
    group: 1
    weight_filler {
      type: "gaussian"
      std: 0.02
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu2"
  type: RELU
  bottom: "conv2"
  top: "conv2"
 }
 layers {
  name: "norm2"
  type: LRN
  bottom: "conv2"
  top: "norm2"
  lrn_param {
    local_size: 5
    alpha: 0.0001
    beta: 0.75
  }
 }
 layers {
  name: "pool2"
  type: POOLING
  bottom: "norm2"
  top: "pool2"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
    pad: 0
  }
 }
 # Inception module 1 ***************
 # Four parallel branches, all fed (directly or through a 1x1 reduction) by "pool2":
 #   icp1_out0: 1x1 conv, 64 outputs
 #   icp1_out1: 1x1 reduction (96) -> 3x3 conv (pad 1), 128 outputs
 #   icp1_out2: 1x1 reduction (16) -> 5x5 conv (pad 2), 32 outputs
 #   icp1_out3: 3x3 max pool (stride 1, pad 1) -> 1x1 conv, 32 outputs
 # Every conv is followed by an in-place ReLU. The CONCAT layer at the end joins
 # the branches into "icp2_in" (64+128+32+32 = 256 channels, assuming Caffe's
 # default channel-axis concat).
 layers {
  name: "icp1_reduction1"
  type: CONVOLUTION
  bottom: "pool2"
  top: "icp1_reduction1"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 96
    pad: 0
    kernel_size: 1
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp1_reduction1"
  type: RELU
  bottom: "icp1_reduction1"
  top: "icp1_reduction1"
 }
 layers {
  name: "icp1_reduction2"
  type: CONVOLUTION
  bottom: "pool2"
  top: "icp1_reduction2"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 16
    pad: 0
    kernel_size: 1
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp1_reduction2"
  type: RELU
  bottom: "icp1_reduction2"
  top: "icp1_reduction2"
 }
 layers {
  name: "icp1_pool"
  type: POOLING
  bottom: "pool2"
  top: "icp1_pool"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 1
    pad: 1
  }
 }
 # ***********
 # Branch output layers (icp1_out0 .. icp1_out3).
 layers {
  name: "icp1_out0"
  type: CONVOLUTION
  bottom: "pool2"
  top: "icp1_out0"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 64
    pad: 0
    kernel_size: 1
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp1_out0"
  type: RELU
  bottom: "icp1_out0"
  top: "icp1_out0"
 }
 layers {
  name: "icp1_out1"
  type: CONVOLUTION
  bottom: "icp1_reduction1"
  top: "icp1_out1"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 128
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "gaussian"
      std: 0.04
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp1_out1"
  type: RELU
  bottom: "icp1_out1"
  top: "icp1_out1"
 }
 layers {
  name: "icp1_out2"
  type: CONVOLUTION
  bottom: "icp1_reduction2"
  top: "icp1_out2"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 32
    pad: 2
    kernel_size: 5
    weight_filler {
      type: "gaussian"
      std: 0.08
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp1_out2"
  type: RELU
  bottom: "icp1_out2"
  top: "icp1_out2"
 }
 layers {
  name: "icp1_out3"
  type: CONVOLUTION
  bottom: "icp1_pool"
  top: "icp1_out3"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 32
    pad: 0
    kernel_size: 1
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp1_out3"
  type: RELU
  bottom: "icp1_out3"
  top: "icp1_out3"
 }
 # Concatenate the four branch outputs; the result feeds inception module 2.
 layers {
  name: "icp2_in"
  type: CONCAT
  bottom: "icp1_out0"
  bottom: "icp1_out1"
  bottom: "icp1_out2"
  bottom: "icp1_out3"
  top: "icp2_in"
 }

 # Inception module 2 ***************
 # Four parallel branches, all fed (directly or through a 1x1 reduction) by "icp2_in":
 #   icp2_out0: 1x1 conv, 128 outputs
 #   icp2_out1: 1x1 reduction (128) -> 3x3 conv (pad 1), 192 outputs
 #   icp2_out2: 1x1 reduction (32) -> 5x5 conv (pad 2), 96 outputs
 #   icp2_out3: 3x3 max pool (stride 1, pad 1) -> 1x1 conv, 64 outputs
 # Every conv is followed by an in-place ReLU. The CONCAT layer at the end joins
 # the branches into "icp2_out" (128+192+96+64 = 480 channels, assuming Caffe's
 # default channel-axis concat).
 layers {
  name: "icp2_reduction1"
  type: CONVOLUTION
  bottom: "icp2_in"
  top: "icp2_reduction1"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 128
    pad: 0
    kernel_size: 1
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp2_reduction1"
  type: RELU
  bottom: "icp2_reduction1"
  top: "icp2_reduction1"
 }
 layers {
  name: "icp2_reduction2"
  type: CONVOLUTION
  bottom: "icp2_in"
  top: "icp2_reduction2"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 32
    pad: 0
    kernel_size: 1
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp2_reduction2"
  type: RELU
  bottom: "icp2_reduction2"
  top: "icp2_reduction2"
 }
 layers {
  name: "icp2_pool"
  type: POOLING
  bottom: "icp2_in"
  top: "icp2_pool"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 1
    pad: 1
  }
 }
 # ***********
 # Branch output layers (icp2_out0 .. icp2_out3).
 layers {
  name: "icp2_out0"
  type: CONVOLUTION
  bottom: "icp2_in"
  top: "icp2_out0"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 128
    pad: 0
    kernel_size: 1
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp2_out0"
  type: RELU
  bottom: "icp2_out0"
  top: "icp2_out0"
 }
 layers {
  name: "icp2_out1"
  type: CONVOLUTION
  bottom: "icp2_reduction1"
  top: "icp2_out1"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 192
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "gaussian"
      std: 0.04
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp2_out1"
  type: RELU
  bottom: "icp2_out1"
  top: "icp2_out1"
 }
 layers {
  name: "icp2_out2"
  type: CONVOLUTION
  bottom: "icp2_reduction2"
  top: "icp2_out2"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 96
    pad: 2
    kernel_size: 5
    weight_filler {
      type: "gaussian"
      std: 0.08
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp2_out2"
  type: RELU
  bottom: "icp2_out2"
  top: "icp2_out2"
 }
 layers {
  name: "icp2_out3"
  type: CONVOLUTION
  bottom: "icp2_pool"
  top: "icp2_out3"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 64
    pad: 0
    kernel_size: 1
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp2_out3"
  type: RELU
  bottom: "icp2_out3"
  top: "icp2_out3"
 }
 # Concatenate the four branch outputs.
 layers {
  name: "icp2_out"
  type: CONCAT
  bottom: "icp2_out0"
  bottom: "icp2_out1"
  bottom: "icp2_out2"
  bottom: "icp2_out3"
  top: "icp2_out"
 }
 # Down-sampling between inception modules 2 and 3: 3x3 stride-2 max pool over
 # "icp2_out"; its output "icp3_in" is the input of inception module 3.
 layers {
  name: "icp3_in"
  type: POOLING
  bottom: "icp2_out"
  top: "icp3_in"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
    pad: 0
  }
 }

 # Inception module 3 ***************
 # Four parallel branches, all fed (directly or through a 1x1 reduction) by "icp3_in":
 #   icp3_out0: 1x1 conv, 192 outputs
 #   icp3_out1: 1x1 reduction (96) -> 3x3 conv (pad 1), 208 outputs
 #   icp3_out2: 1x1 reduction (16) -> 5x5 conv (pad 2), 48 outputs
 #   icp3_out3: 3x3 max pool (stride 1, pad 1) -> 1x1 conv, 64 outputs
 # Every conv is followed by an in-place ReLU. The CONCAT layer at the end joins
 # the branches into "icp3_out" (192+208+48+64 = 512 channels, assuming Caffe's
 # default channel-axis concat). "icp3_out" feeds both the first classification
 # branch and inception module 4.
 layers {
  name: "icp3_reduction1"
  type: CONVOLUTION
  bottom: "icp3_in"
  top: "icp3_reduction1"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 96
    pad: 0
    kernel_size: 1
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp3_reduction1"
  type: RELU
  bottom: "icp3_reduction1"
  top: "icp3_reduction1"
 }
 layers {
  name: "icp3_reduction2"
  type: CONVOLUTION
  bottom: "icp3_in"
  top: "icp3_reduction2"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 16
    pad: 0
    kernel_size: 1
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp3_reduction2"
  type: RELU
  bottom: "icp3_reduction2"
  top: "icp3_reduction2"
 }
 layers {
  name: "icp3_pool"
  type: POOLING
  bottom: "icp3_in"
  top: "icp3_pool"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 1
    pad: 1
  }
 }
 # ***********
 # Branch output layers (icp3_out0 .. icp3_out3).
 layers {
  name: "icp3_out0"
  type: CONVOLUTION
  bottom: "icp3_in"
  top: "icp3_out0"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 192
    pad: 0
    kernel_size: 1
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp3_out0"
  type: RELU
  bottom: "icp3_out0"
  top: "icp3_out0"
 }
 layers {
  name: "icp3_out1"
  type: CONVOLUTION
  bottom: "icp3_reduction1"
  top: "icp3_out1"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 208
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "gaussian"
      std: 0.04
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp3_out1"
  type: RELU
  bottom: "icp3_out1"
  top: "icp3_out1"
 }
 layers {
  name: "icp3_out2"
  type: CONVOLUTION
  bottom: "icp3_reduction2"
  top: "icp3_out2"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 48
    pad: 2
    kernel_size: 5
    weight_filler {
      type: "gaussian"
      std: 0.08
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp3_out2"
  type: RELU
  bottom: "icp3_out2"
  top: "icp3_out2"
 }
 layers {
  name: "icp3_out3"
  type: CONVOLUTION
  bottom: "icp3_pool"
  top: "icp3_out3"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 64
    pad: 0
    kernel_size: 1
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp3_out3"
  type: RELU
  bottom: "icp3_out3"
  top: "icp3_out3"
 }
 # Concatenate the four branch outputs.
 layers {
  name: "icp3_out"
  type: CONCAT
  bottom: "icp3_out0"
  bottom: "icp3_out1"
  bottom: "icp3_out2"
  bottom: "icp3_out3"
  top: "icp3_out"
 }

 # first classification branch ************
 # Auxiliary classifier attached to "icp3_out":
 #   5x5 stride-3 average pool -> 1x1 conv (128) -> ReLU -> FC 1024 -> ReLU
 #   -> dropout (ratio 0.7) -> FC 33 ("cls1_fc2_") -> softmax loss "loss1".
 layers {
  name: "cls1_pool"
  type: POOLING
  bottom: "icp3_out"
  top: "cls1_pool"
  pooling_param {
    pool: AVE
    kernel_size: 5
    stride: 3
    pad: 0
    # this padding is somewhat special
    # NOTE(review): the remark above is the original author's; pad is plainly 0
    # here and the file does not say what is special about it -- confirm.
  }
 }
 layers {
  name: "cls1_reduction"
  type: CONVOLUTION
  bottom: "cls1_pool"
  top: "cls1_reduction"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 128
    pad: 0
    kernel_size: 1
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_cls1_reduction"
  type: RELU
  bottom: "cls1_reduction"
  top: "cls1_reduction"
 }
 layers {
  name: "cls1_fc1"
  type: INNER_PRODUCT
  bottom: "cls1_reduction"
  top: "cls1_fc1"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  inner_product_param {
    num_output: 1024
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_cls1_fc1"
  type: RELU
  bottom: "cls1_fc1"
  top: "cls1_fc1"
 }
 # Dropout is applied in place on "cls1_fc1".
 layers {
  name: "cls1_drop"
  type: DROPOUT
  bottom: "cls1_fc1"
  top: "cls1_fc1"
  dropout_param {
    dropout_ratio: 0.7
  }
 }
 # Final FC of this branch, resized from 1000 to 33 outputs.
 # NOTE(review): the trailing underscore in the name presumably keeps Caffe from
 # copying pretrained weights into this layer (weights are matched by layer
 # name) -- confirm.
 layers {
  name: "cls1_fc2_"
  type: INNER_PRODUCT
  bottom: "cls1_fc1"
  top: "cls1_fc2_"
  #blobs_lr: 1
  #blobs_lr: 2
  # blobs_lr is set 10x higher than for the other layers because this layer
  # starts from random initialization while the others are already trained.
  blobs_lr: 10
  blobs_lr: 20
  weight_decay: 1
  weight_decay: 0
  inner_product_param {
    #num_output: 1000
    num_output: 33
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 # NOTE(review): loss_weight is 1 for this auxiliary loss; the GoogLeNet paper
 # weights its auxiliary classifier losses by 0.3 -- confirm 1 is intentional.
 layers {
  name: "loss1"
  type: SOFTMAX_LOSS
  bottom: "cls1_fc2_"
  bottom: "label"
  top: "loss1"
  loss_weight: 1
 }

 # Inception module 4 ***************
 # Four parallel branches, all fed (directly or through a 1x1 reduction) by "icp3_out":
 #   icp4_out0: 1x1 conv, 160 outputs
 #   icp4_out1: 1x1 reduction (112) -> 3x3 conv (pad 1), 224 outputs
 #   icp4_out2: 1x1 reduction (24) -> 5x5 conv (pad 2), 64 outputs
 #   icp4_out3: 3x3 max pool (stride 1, pad 1) -> 1x1 conv, 64 outputs
 # Every conv is followed by an in-place ReLU. The CONCAT layer at the end joins
 # the branches into "icp4_out" (160+224+64+64 = 512 channels, assuming Caffe's
 # default channel-axis concat).
 layers {
  name: "icp4_reduction1"
  type: CONVOLUTION
  bottom: "icp3_out"
  top: "icp4_reduction1"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 112
    pad: 0
    kernel_size: 1
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp4_reduction1"
  type: RELU
  bottom: "icp4_reduction1"
  top: "icp4_reduction1"
 }
 layers {
  name: "icp4_reduction2"
  type: CONVOLUTION
  bottom: "icp3_out"
  top: "icp4_reduction2"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 24
    pad: 0
    kernel_size: 1
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp4_reduction2"
  type: RELU
  bottom: "icp4_reduction2"
  top: "icp4_reduction2"
 }
 layers {
  name: "icp4_pool"
  type: POOLING
  bottom: "icp3_out"
  top: "icp4_pool"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 1
    pad: 1
  }
 }
 # ***********
 # Branch output layers (icp4_out0 .. icp4_out3).
 layers {
  name: "icp4_out0"
  type: CONVOLUTION
  bottom: "icp3_out"
  top: "icp4_out0"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 160
    pad: 0
    kernel_size: 1
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp4_out0"
  type: RELU
  bottom: "icp4_out0"
  top: "icp4_out0"
 }
 layers {
  name: "icp4_out1"
  type: CONVOLUTION
  bottom: "icp4_reduction1"
  top: "icp4_out1"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 224
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "gaussian"
      std: 0.04
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp4_out1"
  type: RELU
  bottom: "icp4_out1"
  top: "icp4_out1"
 }
 layers {
  name: "icp4_out2"
  type: CONVOLUTION
  bottom: "icp4_reduction2"
  top: "icp4_out2"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 64
    pad: 2
    kernel_size: 5
    weight_filler {
      type: "gaussian"
      std: 0.08
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp4_out2"
  type: RELU
  bottom: "icp4_out2"
  top: "icp4_out2"
 }
 layers {
  name: "icp4_out3"
  type: CONVOLUTION
  bottom: "icp4_pool"
  top: "icp4_out3"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 64
    pad: 0
    kernel_size: 1
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp4_out3"
  type: RELU
  bottom: "icp4_out3"
  top: "icp4_out3"
 }
 # Concatenate the four branch outputs.
 layers {
  name: "icp4_out"
  type: CONCAT
  bottom: "icp4_out0"
  bottom: "icp4_out1"
  bottom: "icp4_out2"
  bottom: "icp4_out3"
  top: "icp4_out"
 }

 # Inception module 5 ***************
 # Four parallel branches, all fed (directly or through a 1x1 reduction) by "icp4_out":
 #   icp5_out0: 1x1 conv, 128 outputs
 #   icp5_out1: 1x1 reduction (128) -> 3x3 conv (pad 1), 256 outputs
 #   icp5_out2: 1x1 reduction (24) -> 5x5 conv (pad 2), 64 outputs
 #   icp5_out3: 3x3 max pool (stride 1, pad 1) -> 1x1 conv, 64 outputs
 # Every conv is followed by an in-place ReLU. The CONCAT layer at the end joins
 # the branches into "icp5_out" (128+256+64+64 = 512 channels, assuming Caffe's
 # default channel-axis concat).
 layers {
  name: "icp5_reduction1"
  type: CONVOLUTION
  bottom: "icp4_out"
  top: "icp5_reduction1"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 128
    pad: 0
    kernel_size: 1
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp5_reduction1"
  type: RELU
  bottom: "icp5_reduction1"
  top: "icp5_reduction1"
 }
 layers {
  name: "icp5_reduction2"
  type: CONVOLUTION
  bottom: "icp4_out"
  top: "icp5_reduction2"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 24
    pad: 0
    kernel_size: 1
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp5_reduction2"
  type: RELU
  bottom: "icp5_reduction2"
  top: "icp5_reduction2"
 }
 layers {
  name: "icp5_pool"
  type: POOLING
  bottom: "icp4_out"
  top: "icp5_pool"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 1
    pad: 1
  }
 }
 # ***********
 # Branch output layers (icp5_out0 .. icp5_out3).
 layers {
  name: "icp5_out0"
  type: CONVOLUTION
  bottom: "icp4_out"
  top: "icp5_out0"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 128
    pad: 0
    kernel_size: 1
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp5_out0"
  type: RELU
  bottom: "icp5_out0"
  top: "icp5_out0"
 }
 layers {
  name: "icp5_out1"
  type: CONVOLUTION
  bottom: "icp5_reduction1"
  top: "icp5_out1"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "gaussian"
      std: 0.04
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp5_out1"
  type: RELU
  bottom: "icp5_out1"
  top: "icp5_out1"
 }
 layers {
  name: "icp5_out2"
  type: CONVOLUTION
  bottom: "icp5_reduction2"
  top: "icp5_out2"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 64
    pad: 2
    kernel_size: 5
    weight_filler {
      type: "gaussian"
      std: 0.08
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp5_out2"
  type: RELU
  bottom: "icp5_out2"
  top: "icp5_out2"
 }
 layers {
  name: "icp5_out3"
  type: CONVOLUTION
  bottom: "icp5_pool"
  top: "icp5_out3"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 64
    pad: 0
    kernel_size: 1
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp5_out3"
  type: RELU
  bottom: "icp5_out3"
  top: "icp5_out3"
 }
 # Concatenate the four branch outputs.
 layers {
  name: "icp5_out"
  type: CONCAT
  bottom: "icp5_out0"
  bottom: "icp5_out1"
  bottom: "icp5_out2"
  bottom: "icp5_out3"
  top: "icp5_out"
 }

 # Inception module 6 ***************
 # Four parallel branches, all fed (directly or through a 1x1 reduction) by "icp5_out":
 #   icp6_out0: 1x1 conv, 112 outputs
 #   icp6_out1: 1x1 reduction (144) -> 3x3 conv (pad 1), 288 outputs
 #   icp6_out2: 1x1 reduction (32) -> 5x5 conv (pad 2), 64 outputs
 #   icp6_out3: 3x3 max pool (stride 1, pad 1) -> 1x1 conv, 64 outputs
 # Every conv is followed by an in-place ReLU. The CONCAT layer at the end joins
 # the branches into "icp6_out" (112+288+64+64 = 528 channels, assuming Caffe's
 # default channel-axis concat). "icp6_out" feeds both the second classification
 # branch and inception module 7.
 layers {
  name: "icp6_reduction1"
  type: CONVOLUTION
  bottom: "icp5_out"
  top: "icp6_reduction1"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 144
    pad: 0
    kernel_size: 1
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp6_reduction1"
  type: RELU
  bottom: "icp6_reduction1"
  top: "icp6_reduction1"
 }
 layers {
  name: "icp6_reduction2"
  type: CONVOLUTION
  bottom: "icp5_out"
  top: "icp6_reduction2"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 32
    pad: 0
    kernel_size: 1
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp6_reduction2"
  type: RELU
  bottom: "icp6_reduction2"
  top: "icp6_reduction2"
 }
 layers {
  name: "icp6_pool"
  type: POOLING
  bottom: "icp5_out"
  top: "icp6_pool"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 1
    pad: 1
  }
 }
 # ***********
 # Branch output layers (icp6_out0 .. icp6_out3).
 layers {
  name: "icp6_out0"
  type: CONVOLUTION
  bottom: "icp5_out"
  top: "icp6_out0"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 112
    pad: 0
    kernel_size: 1
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp6_out0"
  type: RELU
  bottom: "icp6_out0"
  top: "icp6_out0"
 }
 layers {
  name: "icp6_out1"
  type: CONVOLUTION
  bottom: "icp6_reduction1"
  top: "icp6_out1"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 288
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "gaussian"
      std: 0.04
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp6_out1"
  type: RELU
  bottom: "icp6_out1"
  top: "icp6_out1"
 }
 layers {
  name: "icp6_out2"
  type: CONVOLUTION
  bottom: "icp6_reduction2"
  top: "icp6_out2"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 64
    pad: 2
    kernel_size: 5
    weight_filler {
      type: "gaussian"
      std: 0.08
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp6_out2"
  type: RELU
  bottom: "icp6_out2"
  top: "icp6_out2"
 }
 layers {
  name: "icp6_out3"
  type: CONVOLUTION
  bottom: "icp6_pool"
  top: "icp6_out3"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 64
    pad: 0
    kernel_size: 1
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp6_out3"
  type: RELU
  bottom: "icp6_out3"
  top: "icp6_out3"
 }
 # Concatenate the four branch outputs.
 layers {
  name: "icp6_out"
  type: CONCAT
  bottom: "icp6_out0"
  bottom: "icp6_out1"
  bottom: "icp6_out2"
  bottom: "icp6_out3"
  top: "icp6_out"
 }

 # second classification branch ************
 # Auxiliary classifier attached to "icp6_out":
 #   5x5 stride-3 average pool -> 1x1 conv (128) -> ReLU -> FC 1024 -> ReLU
 #   -> dropout (ratio 0.7) -> FC 33 ("cls2_fc2_") -> softmax loss "loss2".
 layers {
  name: "cls2_pool"
  type: POOLING
  bottom: "icp6_out"
  top: "cls2_pool"
  pooling_param {
    pool: AVE
    kernel_size: 5
    stride: 3
    pad: 0
    # this padding is somewhat special
    # NOTE(review): the remark above is the original author's; pad is plainly 0
    # here and the file does not say what is special about it -- confirm.
  }
 }
 layers {
  name: "cls2_reduction"
  type: CONVOLUTION
  bottom: "cls2_pool"
  top: "cls2_reduction"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 128
    pad: 0
    kernel_size: 1
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_cls2_reduction"
  type: RELU
  bottom: "cls2_reduction"
  top: "cls2_reduction"
 }
 layers {
  name: "cls2_fc1"
  type: INNER_PRODUCT
  bottom: "cls2_reduction"
  top: "cls2_fc1"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  inner_product_param {
    num_output: 1024
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_cls2_fc1"
  type: RELU
  bottom: "cls2_fc1"
  top: "cls2_fc1"
 }
 # Dropout is applied in place on "cls2_fc1".
 layers {
  name: "cls2_drop"
  type: DROPOUT
  bottom: "cls2_fc1"
  top: "cls2_fc1"
  dropout_param {
    dropout_ratio: 0.7
  }
 }
 # Final FC of this branch, resized from 1000 to 33 outputs.
 # NOTE(review): the trailing underscore in the name presumably keeps Caffe from
 # copying pretrained weights into this layer (weights are matched by layer
 # name) -- confirm.
 layers {
  name: "cls2_fc2_"
  type: INNER_PRODUCT
  bottom: "cls2_fc1"
  top: "cls2_fc2_"
  #blobs_lr: 1
  #blobs_lr: 2
  # blobs_lr is set 10x higher than for the other layers because this layer
  # starts from random initialization while the others are already trained.
  blobs_lr: 10
  blobs_lr: 20
  weight_decay: 1
  weight_decay: 0
  inner_product_param {
    #num_output: 1000
    num_output: 33
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 # NOTE(review): loss_weight is 1 for this auxiliary loss; the GoogLeNet paper
 # weights its auxiliary classifier losses by 0.3 -- confirm 1 is intentional.
 layers {
  name: "loss2"
  type: SOFTMAX_LOSS
  bottom: "cls2_fc2_"
  bottom: "label"
  top: "loss2"
  loss_weight: 1
 }

 # Inception module 7 ***************
 # Four parallel branches, all fed (directly or through a 1x1 reduction) by "icp6_out":
 #   icp7_out0: 1x1 conv, 256 outputs
 #   icp7_out1: 1x1 reduction (160) -> 3x3 conv (pad 1), 320 outputs
 #   icp7_out2: 1x1 reduction (32) -> 5x5 conv (pad 2), 128 outputs
 #   icp7_out3: 3x3 max pool (stride 1, pad 1) -> 1x1 conv, 128 outputs
 # Every conv is followed by an in-place ReLU. The CONCAT layer at the end joins
 # the branches into "icp7_out" (256+320+128+128 = 832 channels, assuming Caffe's
 # default channel-axis concat).
 layers {
  name: "icp7_reduction1"
  type: CONVOLUTION
  bottom: "icp6_out"
  top: "icp7_reduction1"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 160
    pad: 0
    kernel_size: 1
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp7_reduction1"
  type: RELU
  bottom: "icp7_reduction1"
  top: "icp7_reduction1"
 }
 layers {
  name: "icp7_reduction2"
  type: CONVOLUTION
  bottom: "icp6_out"
  top: "icp7_reduction2"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 32
    pad: 0
    kernel_size: 1
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp7_reduction2"
  type: RELU
  bottom: "icp7_reduction2"
  top: "icp7_reduction2"
 }
 layers {
  name: "icp7_pool"
  type: POOLING
  bottom: "icp6_out"
  top: "icp7_pool"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 1
    pad: 1
  }
 }
 # ***********
 # Branch output layers (icp7_out0 .. icp7_out3).
 layers {
  name: "icp7_out0"
  type: CONVOLUTION
  bottom: "icp6_out"
  top: "icp7_out0"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 256
    pad: 0
    kernel_size: 1
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp7_out0"
  type: RELU
  bottom: "icp7_out0"
  top: "icp7_out0"
 }
 layers {
  name: "icp7_out1"
  type: CONVOLUTION
  bottom: "icp7_reduction1"
  top: "icp7_out1"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 320
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "gaussian"
      std: 0.04
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp7_out1"
  type: RELU
  bottom: "icp7_out1"
  top: "icp7_out1"
 }
 layers {
  name: "icp7_out2"
  type: CONVOLUTION
  bottom: "icp7_reduction2"
  top: "icp7_out2"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 128
    pad: 2
    kernel_size: 5
    weight_filler {
      type: "gaussian"
      std: 0.08
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp7_out2"
  type: RELU
  bottom: "icp7_out2"
  top: "icp7_out2"
 }
 layers {
  name: "icp7_out3"
  type: CONVOLUTION
  bottom: "icp7_pool"
  top: "icp7_out3"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 128
    pad: 0
    kernel_size: 1
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp7_out3"
  type: RELU
  bottom: "icp7_out3"
  top: "icp7_out3"
 }
 # Concatenate the four branch outputs.
 layers {
  name: "icp7_out"
  type: CONCAT
  bottom: "icp7_out0"
  bottom: "icp7_out1"
  bottom: "icp7_out2"
  bottom: "icp7_out3"
  top: "icp7_out"
 }
 # Down-sampling between inception modules 7 and 8: 3x3 stride-2 max pool over
 # "icp7_out"; its output "icp8_in" is the input of inception module 8.
 layers {
  name: "icp8_in"
  type: POOLING
  bottom: "icp7_out"
  top: "icp8_in"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
    pad: 0
  }
 }
 # Inception module 8 ***************
 # Four parallel branches, all fed (directly or through a 1x1 reduction) by "icp8_in":
 #   icp8_out0: 1x1 conv, 256 outputs
 #   icp8_out1: 1x1 reduction (160) -> 3x3 conv (pad 1), 320 outputs
 #   icp8_out2: 1x1 reduction (32) -> 5x5 conv (pad 2), 128 outputs
 #   icp8_out3: 3x3 max pool (stride 1, pad 1) -> 1x1 conv, 128 outputs
 # Every conv is followed by an in-place ReLU. The CONCAT layer at the end joins
 # the branches into "icp8_out" (256+320+128+128 = 832 channels, assuming Caffe's
 # default channel-axis concat).
 layers {
  name: "icp8_reduction1"
  type: CONVOLUTION
  bottom: "icp8_in"
  top: "icp8_reduction1"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 160
    pad: 0
    kernel_size: 1
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp8_reduction1"
  type: RELU
  bottom: "icp8_reduction1"
  top: "icp8_reduction1"
 }
 layers {
  name: "icp8_reduction2"
  type: CONVOLUTION
  bottom: "icp8_in"
  top: "icp8_reduction2"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 32
    pad: 0
    kernel_size: 1
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp8_reduction2"
  type: RELU
  bottom: "icp8_reduction2"
  top: "icp8_reduction2"
 }
 layers {
  name: "icp8_pool"
  type: POOLING
  bottom: "icp8_in"
  top: "icp8_pool"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 1
    pad: 1
  }
 }
 # ***********
 # Branch output layers (icp8_out0 .. icp8_out3).
 layers {
  name: "icp8_out0"
  type: CONVOLUTION
  bottom: "icp8_in"
  top: "icp8_out0"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 256
    pad: 0
    kernel_size: 1
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp8_out0"
  type: RELU
  bottom: "icp8_out0"
  top: "icp8_out0"
 }
 layers {
  name: "icp8_out1"
  type: CONVOLUTION
  bottom: "icp8_reduction1"
  top: "icp8_out1"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 320
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "gaussian"
      std: 0.04
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp8_out1"
  type: RELU
  bottom: "icp8_out1"
  top: "icp8_out1"
 }
 layers {
  name: "icp8_out2"
  type: CONVOLUTION
  bottom: "icp8_reduction2"
  top: "icp8_out2"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 128
    pad: 2
    kernel_size: 5
    weight_filler {
      type: "gaussian"
      std: 0.08
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp8_out2"
  type: RELU
  bottom: "icp8_out2"
  top: "icp8_out2"
 }
 layers {
  name: "icp8_out3"
  type: CONVOLUTION
  bottom: "icp8_pool"
  top: "icp8_out3"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 128
    pad: 0
    kernel_size: 1
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp8_out3"
  type: RELU
  bottom: "icp8_out3"
  top: "icp8_out3"
 }
 # Concatenate the four branch outputs.
 layers {
  name: "icp8_out"
  type: CONCAT
  bottom: "icp8_out0"
  bottom: "icp8_out1"
  bottom: "icp8_out2"
  bottom: "icp8_out3"
  top: "icp8_out"
 }

 # Inception module 9 ***************
 layers {
  name: "icp9_reduction1"
  type: CONVOLUTION
  bottom: "icp8_out"
  top: "icp9_reduction1"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 192
    pad: 0
    kernel_size: 1
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp9_reduction1"
  type: RELU
  bottom: "icp9_reduction1"
  top: "icp9_reduction1"
 }
 layers {
  name: "icp9_reduction2"
  type: CONVOLUTION
  bottom: "icp8_out"
  top: "icp9_reduction2"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 48
    pad: 0
    kernel_size: 1
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp9_reduction2"
  type: RELU
  bottom: "icp9_reduction2"
  top: "icp9_reduction2"
 }
 layers {
  name: "icp9_pool"
  type: POOLING
  bottom: "icp8_out"
  top: "icp9_pool"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 1
    pad: 1
  }
 }
 # ***********
 layers {
  name: "icp9_out0"
  type: CONVOLUTION
  bottom: "icp8_out"
  top: "icp9_out0"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 384
    pad: 0
    kernel_size: 1
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp9_out0"
  type: RELU
  bottom: "icp9_out0"
  top: "icp9_out0"
 }
 layers {
  name: "icp9_out1"
  type: CONVOLUTION
  bottom: "icp9_reduction1"
  top: "icp9_out1"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 384
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "gaussian"
      std: 0.04
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp9_out1"
  type: RELU
  bottom: "icp9_out1"
  top: "icp9_out1"
 }
 layers {
  name: "icp9_out2"
  type: CONVOLUTION
  bottom: "icp9_reduction2"
  top: "icp9_out2"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 128
    pad: 2
    kernel_size: 5
    weight_filler {
      type: "gaussian"
      std: 0.08
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp9_out2"
  type: RELU
  bottom: "icp9_out2"
  top: "icp9_out2"
 }
 layers {
  name: "icp9_out3"
  type: CONVOLUTION
  bottom: "icp9_pool"
  top: "icp9_out3"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 128
    pad: 0
    kernel_size: 1
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "relu_icp9_out3"
  type: RELU
  bottom: "icp9_out3"
  top: "icp9_out3"
 }
 # Concatenate the four branch outputs (icp9_out0 .. icp9_out3) into icp9_out
 layers {
  name: "icp9_out"
  type: CONCAT
  bottom: "icp9_out0"
  bottom: "icp9_out1"
  bottom: "icp9_out2"
  bottom: "icp9_out3"
  top: "icp9_out"
 }

 # third classification branch
 layers {
  name: "cls3_pool"
  type: POOLING
  bottom: "icp9_out"
  top: "cls3_pool"
  pooling_param {
    pool: AVE
    kernel_size: 7
    stride: 1
    pad: 0
    # NOTE(review): the author flagged this padding as special without giving a reason; confirm before changing pad
  }
 }
 layers {
  name: "cls3_drop"
  type: DROPOUT
  bottom: "cls3_pool"
  top: "cls3_pool"
  dropout_param {
    dropout_ratio: 0.4
  }
 }
 layers {
  name: "cls3_fc_"
  type: INNER_PRODUCT
  bottom: "cls3_pool"
  top: "cls3_fc_"
  #blobs_lr: 1
  #blobs_lr: 2
  # blobs_lr is set higher here than for the other layers because this layer starts from random weights while the others are already trained
  blobs_lr: 10
  blobs_lr: 20
  weight_decay: 1
  weight_decay: 0
  inner_product_param {
    num_output: 33
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
 }
 layers {
  name: "loss3"
  type: SOFTMAX_LOSS
  bottom: "cls3_fc_"
  bottom: "label"
  top: "loss3"
  loss_weight: 1
 }
 layers {
  name: "accuracy1"
  type: ACCURACY
  bottom: "cls3_fc_"
  bottom: "label"
  top: "accuracy1"
  include: { phase: TEST }
 }
 layers {
  name: "accuracy5"
  type: ACCURACY
  bottom: "cls3_fc_"
  bottom: "label"
  top: "accuracy5"
  include: { phase: TEST }
  accuracy_param {
    top_k: 5
  }
 }
diff --git a/update_cmd.bash b/update_cmd.bash
 # Commands that produce the op-listing files copied into ex_gist.
 # Run from one directory above the gist (i.e. run/tr1).
 # Each line runs `boda cnet_ana` for one model and, only if that succeeds (&&),
 # copies the file named by --print-ops-fn into ex_gist.
 # Listings without --expand-ops are copied under a .txt name.
 boda cnet_ana --in-model=alexnet_ng_conv --in-sz=227 --print-ops=1 --print-ops-fn=alexnet_ng_conv.py && cp alexnet_ng_conv.py ex_gist/alexnet_ng_conv.txt
 boda cnet_ana --in-model=googlenet_ft_fl --in-sz=224 --print-ops=1 --print-ops-fn=googlenet_ft_fl.py && cp googlenet_ft_fl.py ex_gist/googlenet.txt
 # The same two models with --expand-ops=1; the *.sgemm.py outputs are copied into ex_gist under the same name.
 boda cnet_ana --in-model=alexnet_ng_conv --in-sz=227 --print-ops=1 --expand-ops=1 --print-ops-fn=alexnet_ng_conv.sgemm.py && cp alexnet_ng_conv.sgemm.py ex_gist
 boda cnet_ana --in-model=googlenet_ft_fl --in-sz=224 --print-ops=1 --expand-ops=1 --print-ops-fn=googlenet_ft_fl.sgemm.py && cp googlenet_ft_fl.sgemm.py ex_gist
	# Expanded op listing for the alexnet_ng_conv model (input 227x227).
	# Every conv / fc-as-conv layer is written as a per-image loop that
	#   1) copies input patches into a <layer>_one_row_per_patch_buf matrix
	#      (patches_to_rows), then
	#   2) multiplies that matrix by the reshaped, transposed filter bank (sgemm).
	# NDA(name, dims...) declares an array; trailing comments give the axis order,
	# and SOURCE / SINK mark arrays tagged as such by the listing.
	# NOTE(review): inside each loop the product is bound to the whole output
	# name (e.g. conv1 = ...) rather than to image i's slice -- presumably
	# shorthand in this listing; confirm before treating it as runnable.
	# NOTE(review): the *_biases arrays are declared but never used below.

	# --- conv1 (11x11 filters, stride 4) -> relu1 -> norm1 -> pool1 ---
	data = NDA("data",num_img,3,227,227) # SOURCE num,chan,y,x
	conv1_filts = NDA("conv1_filts",96,3,11,11) # SOURCE out_chan,in_chan,y,x
	conv1_biases = NDA("conv1_biases",96) # SOURCE out_chan
	conv1 = NDA("conv1",num_img,96,55,55) # num,chan,y,x
	data_one_row_per_patch_buf = NDA("data_one_row_per_patch_buf",3025,363)
	for i in range(0,num_img):
	 patches_to_rows( src=data[i,:,:,:], dest=data_one_row_per_patch_buf, in_pad="0 0 0 0",stride="4 4" ) # one copy per output elem
	 conv1 = data_one_row_per_patch_buf * transpose(reshape(conv1_filts,96,363)) # sgemm: MxNxK == 3025x363x96
	ReLU(name="relu1",in_place=[conv1])
	norm1 = NDA("norm1",num_img,96,55,55) # num,chan,y,x
	LRN(name="norm1",bots=[ conv1 ],tops=[ norm1 ],
		in_pad="0 0 0 0",stride="1 1")
	pool1 = NDA("pool1",num_img,96,27,27) # num,chan,y,x
	Pooling(name="pool1",bots=[ norm1 ],tops=[ pool1 ],
		in_pad="0 0 0 0",stride="2 2")

	# --- conv2 (5x5 filters, pad 2) -> relu2 -> norm2 -> pool2 ---
	conv2_filts = NDA("conv2_filts",256,96,5,5) # SOURCE out_chan,in_chan,y,x
	conv2_biases = NDA("conv2_biases",256) # SOURCE out_chan
	conv2 = NDA("conv2",num_img,256,27,27) # num,chan,y,x
	pool1_one_row_per_patch_buf = NDA("pool1_one_row_per_patch_buf",729,2400)
	for i in range(0,num_img):
	 patches_to_rows( src=pool1[i,:,:,:], dest=pool1_one_row_per_patch_buf, in_pad="2 2 2 2",stride="1 1" ) # one copy per output elem
	 conv2 = pool1_one_row_per_patch_buf * transpose(reshape(conv2_filts,256,2400)) # sgemm: MxNxK == 729x2400x256
	ReLU(name="relu2",in_place=[conv2])
	norm2 = NDA("norm2",num_img,256,27,27) # num,chan,y,x
	LRN(name="norm2",bots=[ conv2 ],tops=[ norm2 ],
		in_pad="0 0 0 0",stride="1 1")
	pool2 = NDA("pool2",num_img,256,13,13) # num,chan,y,x
	Pooling(name="pool2",bots=[ norm2 ],tops=[ pool2 ],
		in_pad="0 0 0 0",stride="2 2")

	# --- conv3 (3x3 filters, pad 1) -> relu3 ---
	conv3_filts = NDA("conv3_filts",384,256,3,3) # SOURCE out_chan,in_chan,y,x
	conv3_biases = NDA("conv3_biases",384) # SOURCE out_chan
	conv3 = NDA("conv3",num_img,384,13,13) # num,chan,y,x
	pool2_one_row_per_patch_buf = NDA("pool2_one_row_per_patch_buf",169,2304)
	for i in range(0,num_img):
	 patches_to_rows( src=pool2[i,:,:,:], dest=pool2_one_row_per_patch_buf, in_pad="1 1 1 1",stride="1 1" ) # one copy per output elem
	 conv3 = pool2_one_row_per_patch_buf * transpose(reshape(conv3_filts,384,2304)) # sgemm: MxNxK == 169x2304x384
	ReLU(name="relu3",in_place=[conv3])

	# --- conv4 (3x3 filters, pad 1) -> relu4 ---
	conv4_filts = NDA("conv4_filts",384,384,3,3) # SOURCE out_chan,in_chan,y,x
	conv4_biases = NDA("conv4_biases",384) # SOURCE out_chan
	conv4 = NDA("conv4",num_img,384,13,13) # num,chan,y,x
	conv3_one_row_per_patch_buf = NDA("conv3_one_row_per_patch_buf",169,3456)
	for i in range(0,num_img):
	 patches_to_rows( src=conv3[i,:,:,:], dest=conv3_one_row_per_patch_buf, in_pad="1 1 1 1",stride="1 1" ) # one copy per output elem
	 conv4 = conv3_one_row_per_patch_buf * transpose(reshape(conv4_filts,384,3456)) # sgemm: MxNxK == 169x3456x384
	ReLU(name="relu4",in_place=[conv4])

	# --- conv5 (3x3 filters, pad 1) -> relu5 -> pool5 ---
	conv5_filts = NDA("conv5_filts",256,384,3,3) # SOURCE out_chan,in_chan,y,x
	conv5_biases = NDA("conv5_biases",256) # SOURCE out_chan
	conv5 = NDA("conv5",num_img,256,13,13) # num,chan,y,x
	conv4_one_row_per_patch_buf = NDA("conv4_one_row_per_patch_buf",169,3456)
	for i in range(0,num_img):
	 patches_to_rows( src=conv4[i,:,:,:], dest=conv4_one_row_per_patch_buf, in_pad="1 1 1 1",stride="1 1" ) # one copy per output elem
	 conv5 = conv4_one_row_per_patch_buf * transpose(reshape(conv5_filts,256,3456)) # sgemm: MxNxK == 169x3456x256
	ReLU(name="relu5",in_place=[conv5])
	pool5 = NDA("pool5",num_img,256,6,6) # num,chan,y,x
	Pooling(name="pool5",bots=[ conv5 ],tops=[ pool5 ],
		in_pad="0 0 0 0",stride="2 2")

	# --- fc6 as a 6x6 convolution over pool5 -> relu6 -> drop6 ---
	fc6_conv_filts = NDA("fc6_conv_filts",4096,256,6,6) # SOURCE out_chan,in_chan,y,x
	fc6_conv_biases = NDA("fc6_conv_biases",4096) # SOURCE out_chan
	fc6 = NDA("fc6",num_img,4096,1,1) # num,chan,y,x
	pool5_one_row_per_patch_buf = NDA("pool5_one_row_per_patch_buf",1,9216)
	for i in range(0,num_img):
	 patches_to_rows( src=pool5[i,:,:,:], dest=pool5_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
	 fc6 = pool5_one_row_per_patch_buf * transpose(reshape(fc6_conv_filts,4096,9216)) # sgemm: MxNxK == 1x9216x4096
	ReLU(name="relu6",in_place=[fc6])
	Dropout(name="drop6",in_place=[fc6])

	# --- fc7 as a 1x1 convolution over fc6 -> relu7 -> drop7 ---
	fc7_conv_filts = NDA("fc7_conv_filts",4096,4096,1,1) # SOURCE out_chan,in_chan,y,x
	fc7_conv_biases = NDA("fc7_conv_biases",4096) # SOURCE out_chan
	fc7 = NDA("fc7",num_img,4096,1,1) # num,chan,y,x
	fc6_one_row_per_patch_buf = NDA("fc6_one_row_per_patch_buf",1,4096)
	for i in range(0,num_img):
	 patches_to_rows( src=fc6[i,:,:,:], dest=fc6_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
	 fc7 = fc6_one_row_per_patch_buf * transpose(reshape(fc7_conv_filts,4096,4096)) # sgemm: MxNxK == 1x4096x4096
	ReLU(name="relu7",in_place=[fc7])
	Dropout(name="drop7",in_place=[fc7])

	# --- fc8 as a 1x1 convolution over fc7 (final output, marked SINK) ---
	fc8_conv_filts = NDA("fc8_conv_filts",1000,4096,1,1) # SOURCE out_chan,in_chan,y,x
	fc8_conv_biases = NDA("fc8_conv_biases",1000) # SOURCE out_chan
	fc8 = NDA("fc8",num_img,1000,1,1) # SINK num,chan,y,x
	fc7_one_row_per_patch_buf = NDA("fc7_one_row_per_patch_buf",1,4096)
	for i in range(0,num_img):
	 patches_to_rows( src=fc7[i,:,:,:], dest=fc7_one_row_per_patch_buf, in_pad="0 0 0 0",stride="1 1" ) # one copy per output elem
	 fc8 = fc7_one_row_per_patch_buf * transpose(reshape(fc8_conv_filts,1000,4096)) # sgemm: MxNxK == 1x4096x1000
	name: "AlexNet"
	layer {
	name: "data"
	type: "Data"
	top: "data"
	top: "label"
	include {
	phase: TRAIN
	}
	transform_param {
	mirror: true
	crop_size: 227
	mean_file: "/scratch/datasets/data/ilsvrc12/imagenet_mean.binaryproto"
	}
	data_param {
	source: "/scratch/datasets/imagenet_classification/ilsvrc12_train_lmdb"
	batch_size: 256
	backend: LMDB
	}
	}
	layer {
	name: "data"
	type: "Data"
	top: "data"
	top: "label"
	include {
	phase: TEST
	}
	transform_param {
	mirror: false
	crop_size: 227
	mean_file: "/scratch/datasets/data/ilsvrc12/imagenet_mean.binaryproto"
	}
	data_param {
	source: "/scratch/datasets/imagenet_classification/ilsvrc12_val_lmdb"
	batch_size: 50
	backend: LMDB
	}
	}
	layer {
	name: "conv1"
	type: "Convolution"
	bottom: "data"
	top: "conv1"
	param {
	lr_mult: 1
	decay_mult: 1
	}
	param {
	lr_mult: 2
	decay_mult: 0
	}
	convolution_param {
	num_output: 96
	kernel_size: 11
	stride: 4
	weight_filler {
	type: "gaussian"
	std: 0.01
	}
	bias_filler {
	type: "constant"
	value: 0
	}
	}
	}
	layer {
	name: "relu1"
	type: "ReLU"
	bottom: "conv1"
	top: "conv1"
	}
	layer {
	name: "norm1"
	type: "LRN"
	bottom: "conv1"
	top: "norm1"
	lrn_param {
	local_size: 5
	alpha: 0.0001
	beta: 0.75
	}
	}
	layer {
	name: "pool1"
	type: "Pooling"
	bottom: "norm1"
	top: "pool1"
	pooling_param {
	pool: MAX
	kernel_size: 3
	stride: 2
	}
	}
	layer {
	name: "conv2"
	type: "Convolution"
	bottom: "pool1"
	top: "conv2"
	param {
	lr_mult: 1
	decay_mult: 1
	}
	param {
	lr_mult: 2
	decay_mult: 0
	}
	convolution_param {
	num_output: 256
	pad: 2
	kernel_size: 5
	weight_filler {
	type: "gaussian"
	std: 0.01
	}
	bias_filler {
	type: "constant"
	value: 0.1
	}
	}
	}
	layer {
	name: "relu2"
	type: "ReLU"
	bottom: "conv2"
	top: "conv2"
	}
	layer {
	name: "norm2"
	type: "LRN"
	bottom: "conv2"
	top: "norm2"
	lrn_param {
	local_size: 5
	alpha: 0.0001
	beta: 0.75
	}
	}
	layer {
	name: "pool2"
	type: "Pooling"
	bottom: "norm2"
	top: "pool2"
	pooling_param {
	pool: MAX
	kernel_size: 3
	stride: 2
	}
	}
	layer {
	name: "conv3"
	type: "Convolution"
	bottom: "pool2"
	top: "conv3"
	param {
	lr_mult: 1
	decay_mult: 1
	}
	param {
	lr_mult: 2
	decay_mult: 0
	}
	convolution_param {
	num_output: 384
	pad: 1
	kernel_size: 3
	weight_filler {
	type: "gaussian"
	std: 0.01
	}
	bias_filler {
	type: "constant"
	value: 0
	}
	}
	}
	layer {
	name: "relu3"
	type: "ReLU"
	bottom: "conv3"
	top: "conv3"
	}
	layer {
	name: "conv4"
	type: "Convolution"
	bottom: "conv3"
	top: "conv4"
	param {
	lr_mult: 1
	decay_mult: 1
	}
	param {
	lr_mult: 2
	decay_mult: 0
	}
	convolution_param {
	num_output: 384
	pad: 1
	kernel_size: 3
	weight_filler {
	type: "gaussian"
	std: 0.01
	}
	bias_filler {
	type: "constant"
	value: 0.1
	}
	}
	}
	layer {
	name: "relu4"
	type: "ReLU"
	bottom: "conv4"
	top: "conv4"
	}
	layer {
	name: "conv5"
	type: "Convolution"
	bottom: "conv4"
	top: "conv5"
	param {
	lr_mult: 1
	decay_mult: 1
	}
	param {
	lr_mult: 2
	decay_mult: 0
	}
	convolution_param {
	num_output: 256
	pad: 1
	kernel_size: 3
	weight_filler {
	type: "gaussian"
	std: 0.01
	}
	bias_filler {
	type: "constant"
	value: 0.1
	}
	}
	}
	layer {
	name: "relu5"
	type: "ReLU"
	bottom: "conv5"
	top: "conv5"
	}
	layer {
	name: "pool5"
	type: "Pooling"
	bottom: "conv5"
	top: "pool5"
	pooling_param {
	pool: MAX
	kernel_size: 3
	stride: 2
	}
	}
	layer {
	name: "fc6-conv"
	type: "Convolution"
	bottom: "pool5"
	top: "fc6"
	param {
	lr_mult: 1
	decay_mult: 1
	}
	param {
	lr_mult: 2
	decay_mult: 0
	}
	convolution_param {
	num_output: 4096
	kernel_size: 6
	weight_filler {
	type: "gaussian"
	std: 0.005
	}
	bias_filler {
	type: "constant"
	value: 0.1
	}
	}
	}
	layer {
	name: "relu6"
	type: "ReLU"
	bottom: "fc6"
	top: "fc6"
	}
	layer {
	name: "drop6"
	type: "Dropout"
	bottom: "fc6"
	top: "fc6"
	dropout_param {
	dropout_ratio: 0.5
	}
	}
	layer {
	name: "fc7-conv"
	type: "Convolution"
	bottom: "fc6"
	top: "fc7"
	param {
	lr_mult: 1
	decay_mult: 1
	}
	param {
	lr_mult: 2
	decay_mult: 0
	}
	convolution_param {
	num_output: 4096
	kernel_size: 1
	weight_filler {
	type: "gaussian"
	std: 0.005
	}
	bias_filler {
	type: "constant"
	value: 0.1
	}
	}
	}
	layer {
	name: "relu7"
	type: "ReLU"
	bottom: "fc7"
	top: "fc7"
	}
	layer {
	name: "drop7"
	type: "Dropout"
	bottom: "fc7"
	top: "fc7"
	dropout_param {
	dropout_ratio: 0.5
	}
	}
	layer {
	name: "fc8-conv"
	type: "Convolution"
	bottom: "fc7"
	top: "fc8"
	param {
	lr_mult: 1
	decay_mult: 1
	}
	param {
	lr_mult: 2
	decay_mult: 0
	}
	convolution_param {
	num_output: 1000
	kernel_size: 1
	weight_filler {
	type: "gaussian"
	std: 0.01
	}
	bias_filler {
	type: "constant"
	value: 0
	}
	}
	}
	layer {
	name: "accuracy"
	type: "Accuracy"
	bottom: "fc8"
	bottom: "label"
	top: "accuracy"
	include {
	phase: TEST
	}
	}
	layer {
	name: "loss"
	type: "SoftmaxWithLoss"
	bottom: "fc8"
	bottom: "label"
	top: "loss"
	}
	# Commands that produce the op-listing files copied into ex_gist.
	# Run from one directory above the gist (i.e. run/tr1).
	# Each line runs `boda cnet_ana` for one model and, only if that succeeds (&&),
	# copies the file named by --print-ops-fn into ex_gist.
	# Listings without --expand-ops are copied under a .txt name.
	boda cnet_ana --in-model=alexnet_ng_conv --in-sz=227 --print-ops=1 --print-ops-fn=alexnet_ng_conv.py && cp alexnet_ng_conv.py ex_gist/alexnet_ng_conv.txt
	boda cnet_ana --in-model=googlenet_ft_fl --in-sz=224 --print-ops=1 --print-ops-fn=googlenet_ft_fl.py && cp googlenet_ft_fl.py ex_gist/googlenet.txt
	# The same two models with --expand-ops=1; the *.sgemm.py outputs are copied into ex_gist under the same name.
	boda cnet_ana --in-model=alexnet_ng_conv --in-sz=227 --print-ops=1 --expand-ops=1 --print-ops-fn=alexnet_ng_conv.sgemm.py && cp alexnet_ng_conv.sgemm.py ex_gist
	boda cnet_ana --in-model=googlenet_ft_fl --in-sz=224 --print-ops=1 --expand-ops=1 --print-ops-fn=googlenet_ft_fl.sgemm.py && cp googlenet_ft_fl.sgemm.py ex_gist