zomux · August 29, 2015 14:21
diff --git a/prof.7.txt b/prof.7.txt
 Function profiling
 ==================
  Message: /home/hadoop/deepy/deepy/trainers/trainers.py:282
  Time in 131 calls to Function.__call__: 1.497694e+01s
  Time in Function.fn.__call__: 1.494386e+01s (99.779%)
  Time in thunks: 1.420090e+01s (94.818%)
  Total compile time: 6.766920e-01s
    Number of Apply nodes: 167
    Theano Optimizer time: 3.270850e-01s
       Theano validate time: 2.164602e-03s
    Theano Linker time (includes C, CUDA code generation/compiling): 2.244329e-01s
       Import time 7.803679e-02s
 
 Time in all call to theano.grad() 1.627803e-02s
 Class
 ---
 <% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name>
  69.7%    69.7%       9.892s       1.08e-03s     C     9170      76   theano.tensor.elemwise.Elemwise
  13.9%    83.6%       1.974s       1.88e-03s     C     1048       8   theano.tensor.blas.Dot22
  13.2%    96.7%       1.872s       4.76e-03s     C      393       3   theano.tensor.blas.Gemm
   1.4%    98.1%       0.193s       2.45e-04s     C      786       6   theano.tensor.elemwise.CAReduce
   0.8%    98.9%       0.108s       3.43e-05s     Py    3144      12   theano.ifelse.IfElse
   0.7%    99.5%       0.095s       4.85e-05s     C     1965      15   theano.tensor.elemwise.Sum
   0.1%    99.7%       0.021s       1.14e-05s     C     1834      14   theano.tensor.elemwise.DimShuffle
   0.1%    99.8%       0.016s       2.02e-05s     C      786       6   theano.tensor.subtensor.Subtensor
   0.1%    99.8%       0.007s       5.70e-05s     C      131       7   theano.tensor.basic.Alloc
   0.0%    99.9%       0.006s       1.50e-05s     C      393       3   theano.tensor.basic.AllocEmpty
   0.0%    99.9%       0.004s       3.25e-05s     C      131       1   theano.tensor.basic.MaxAndArgmax
   0.0%    99.9%       0.003s       2.12e-05s     Py     131       1   theano.tensor.subtensor.AdvancedSubtensor
   0.0%    99.9%       0.003s       2.41e-06s     C     1048       8   theano.compile.ops.Shape_i
   0.0%   100.0%       0.002s       1.84e-05s     Py     131       1   theano.tensor.subtensor.AdvancedIncSubtensor
   0.0%   100.0%       0.002s       1.52e-05s     Py     131       1   theano.tensor.basic.ARange
   0.0%   100.0%       0.001s       9.66e-06s     C      131       1   theano.tensor.nnet.nnet.SoftmaxWithBias
   0.0%   100.0%       0.001s       2.97e-06s     C      393       3   theano.tensor.opt.MakeVector
   0.0%   100.0%       0.001s       3.90e-06s     C      131       1   theano.tensor.nnet.nnet.SoftmaxGrad
   ... (remaining 0 Classes account for   0.00%(0.00s) of the runtime)
 
 Ops
 ---
 <% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name>
  15.5%    15.5%       2.206s       5.61e-03s     C      393        3   Elemwise{Composite{(i0 + (i1 * i2))}}
  15.5%    31.0%       2.195s       2.79e-03s     C      786        6   Elemwise{Composite{sqr(Abs(i0))}}
  13.9%    44.9%       1.974s       1.88e-03s     C     1048        8   Dot22
  13.2%    58.1%       1.872s       4.76e-03s     C      393        3   Gemm{no_inplace}
  10.5%    68.6%       1.489s       1.26e-03s     C     1179        9   Elemwise{add,no_inplace}
  10.2%    78.8%       1.450s       1.85e-03s     C      786        6   Elemwise{Composite{((i0 * i1) - (i2 * i3))}}
   7.5%    86.3%       1.063s       2.03e-03s     C      524        4   Elemwise{mul}
   5.3%    91.6%       0.752s       2.87e-03s     C      262        2   Elemwise{gt,no_inplace}
   5.0%    96.6%       0.715s       2.73e-03s     C      262        2   Elemwise{Composite{Abs((i0 * i1))}}
   1.4%    97.9%       0.193s       2.45e-04s     C      786        6   Reduce{maximum}
   0.8%    98.7%       0.108s       3.43e-05s     Py    3144       12   if{}
   0.6%    99.3%       0.091s       6.33e-05s     C     1441       11   Sum{acc_dtype=float64}
   0.1%    99.5%       0.016s       2.02e-05s     C      786        6   Subtensor{int64}
   0.1%    99.6%       0.016s       2.39e-05s     C      655        5   DimShuffle{1,0}
   0.1%    99.6%       0.007s       5.70e-05s     C      131        7   Alloc
   0.0%    99.7%       0.006s       1.50e-05s     C      393        3   AllocEmpty{dtype='float32'}
   0.0%    99.7%       0.006s       1.10e-05s     C      524        4   Elemwise{Composite{((i0 / i1) / i2)}}
   0.0%    99.7%       0.004s       3.25e-05s     C      131        1   MaxAndArgmax
   0.0%    99.8%       0.004s       9.22e-06s     C      393        3   Sum{axis=[0], acc_dtype=float64}
   0.0%    99.8%       0.003s       2.12e-05s     Py     131        1   AdvancedSubtensor
   ... (remaining 25 Ops account for   0.22%(0.03s) of the runtime)
 
 Apply
 ------
 <% time> <sum %> <apply time> <time per call> <#call> <id> <Mflops> <Gflops/s> <Apply name>
   7.3%     7.3%       1.030s       7.86e-03s    131   114                     Elemwise{Composite{sqr(Abs(i0))}}(Dot22.0)
    input 0: dtype=float32, shape=(784, 256), strides=c 
    output 0: dtype=float32, shape=(784, 256), strides=c 
   7.2%    14.5%       1.026s       7.83e-03s    131   154                     Elemwise{Composite{(i0 + (i1 * i2))}}(Gemm{no_inplace}.0, TensorConstant{(1, 1) of 0.0002}, W_dense1)
    input 0: dtype=float32, shape=(784, 256), strides=c 
    input 1: dtype=float32, shape=(1, 1), strides=c 
    input 2: dtype=float32, shape=(784, 256), strides=c 
    output 0: dtype=float32, shape=(784, 256), strides=c 
   6.8%    21.3%       0.966s       7.38e-03s    131   140                     Elemwise{Composite{(i0 + (i1 * i2))}}(Gemm{no_inplace}.0, TensorConstant{(1, 1) of 0.0002}, W_dense2)
    input 0: dtype=float32, shape=(256, 256), strides=c 
    input 1: dtype=float32, shape=(1, 1), strides=c 
    input 2: dtype=float32, shape=(256, 256), strides=c 
    output 0: dtype=float32, shape=(256, 256), strides=c 
   6.7%    28.0%       0.947s       7.23e-03s    131   148                     Gemm{no_inplace}(AllocEmpty{dtype='float32'}.0, if{}.0, x.T, Elemwise{mul}.0, TensorConstant{0.0})
    input 0: dtype=float32, shape=(784, 256), strides=c 
    input 1: dtype=float32, shape=(), strides=c 
    input 2: dtype=float32, shape=(784, 20), strides=(4, 3136) 
    input 3: dtype=float32, shape=(20, 256), strides=c 
    input 4: dtype=float32, shape=(), strides=c 
    output 0: dtype=float32, shape=(784, 256), strides=c 
   6.5%    34.4%       0.921s       7.03e-03s    131   132                     Gemm{no_inplace}(AllocEmpty{dtype='float32'}.0, if{}.0, DimShuffle{1,0}.0, Elemwise{mul}.0, TensorConstant{0.0})
    input 0: dtype=float32, shape=(256, 256), strides=c 
    input 1: dtype=float32, shape=(), strides=c 
    input 2: dtype=float32, shape=(256, 20), strides=(4, 1024) 
    input 3: dtype=float32, shape=(20, 256), strides=c 
    input 4: dtype=float32, shape=(), strides=c 
    output 0: dtype=float32, shape=(256, 256), strides=c 
   6.5%    40.9%       0.921s       7.03e-03s    131    99                     Elemwise{Composite{sqr(Abs(i0))}}(Dot22.0)
    input 0: dtype=float32, shape=(256, 256), strides=c 
    output 0: dtype=float32, shape=(256, 256), strides=c 
   6.1%    47.0%       0.861s       6.57e-03s    131    94                     Dot22(DimShuffle{1,0}.0, Elemwise{mul}.0)
    input 0: dtype=float32, shape=(256, 20), strides=(4, 1024) 
    input 1: dtype=float32, shape=(20, 256), strides=c 
    output 0: dtype=float32, shape=(256, 256), strides=c 
   5.7%    52.7%       0.812s       6.20e-03s    131   108                     Dot22(x.T, Elemwise{mul}.0)
    input 0: dtype=float32, shape=(784, 20), strides=(4, 3136) 
    input 1: dtype=float32, shape=(20, 256), strides=c 
    output 0: dtype=float32, shape=(784, 256), strides=c 
   5.5%    58.2%       0.780s       5.95e-03s    131   166                     Elemwise{Composite{((i0 * i1) - (i2 * i3))}}(TensorConstant{(1, 1) of 0.9}, W_dense1_vel, DimShuffle{x,x}.0, if{}.0)
    input 0: dtype=float32, shape=(1, 1), strides=c 
    input 1: dtype=float32, shape=(784, 256), strides=c 
    input 2: dtype=float32, shape=(1, 1), strides=c 
    input 3: dtype=float32, shape=(784, 256), strides=c 
    output 0: dtype=float32, shape=(784, 256), strides=c 
   4.5%    62.7%       0.638s       4.87e-03s    131    18                     Elemwise{add,no_inplace}(W_dense3, W_dense3_vel)
    input 0: dtype=float32, shape=(256, 10), strides=c 
    input 1: dtype=float32, shape=(256, 10), strides=c 
    output 0: dtype=float32, shape=(256, 10), strides=c 
   3.9%    66.6%       0.554s       4.23e-03s    131   160                     Elemwise{Composite{((i0 * i1) - (i2 * i3))}}(TensorConstant{(1, 1) of 0.9}, W_dense2_vel, DimShuffle{x,x}.0, if{}.0)
    input 0: dtype=float32, shape=(1, 1), strides=c 
    input 1: dtype=float32, shape=(256, 256), strides=c 
    input 2: dtype=float32, shape=(1, 1), strides=c 
    input 3: dtype=float32, shape=(256, 256), strides=c 
    output 0: dtype=float32, shape=(256, 256), strides=c 
   3.7%    70.3%       0.525s       4.01e-03s    131    39                     Elemwise{gt,no_inplace}(Elemwise{add,no_inplace}.0, TensorConstant{(1, 1) of 0})
    input 0: dtype=float32, shape=(20, 256), strides=c 
    input 1: dtype=int8, shape=(1, 1), strides=c 
    output 0: dtype=int8, shape=(20, 256), strides=c 
   3.6%    73.9%       0.510s       3.90e-03s    131    20                     Elemwise{add,no_inplace}(W_dense2, W_dense2_vel)
    input 0: dtype=float32, shape=(256, 256), strides=c 
    input 1: dtype=float32, shape=(256, 256), strides=c 
    output 0: dtype=float32, shape=(256, 256), strides=c 
   3.1%    77.0%       0.445s       3.40e-03s    131   100                     Elemwise{mul}(Dot22.0, Elemwise{gt,no_inplace}.0)
    input 0: dtype=float32, shape=(20, 256), strides=c 
    input 1: dtype=int8, shape=(20, 256), strides=c 
    output 0: dtype=float32, shape=(20, 256), strides=c 
   2.6%    79.6%       0.373s       2.85e-03s    131    44                     Elemwise{mul,no_inplace}(Elemwise{add,no_inplace}.0, Elemwise{gt,no_inplace}.0)
    input 0: dtype=float32, shape=(20, 256), strides=c 
    input 1: dtype=int8, shape=(20, 256), strides=c 
    output 0: dtype=float32, shape=(20, 256), strides=c 
   2.6%    82.2%       0.364s       2.78e-03s    131    45                     Elemwise{Composite{Abs((i0 * i1))}}(Elemwise{add,no_inplace}.0, Elemwise{gt,no_inplace}.0)
    input 0: dtype=float32, shape=(20, 256), strides=c 
    input 1: dtype=int8, shape=(20, 256), strides=c 
    output 0: dtype=float32, shape=(20, 256), strides=c 
   2.5%    84.7%       0.351s       2.68e-03s    131    61                     Elemwise{Composite{Abs((i0 * i1))}}(Elemwise{add,no_inplace}.0, Elemwise{gt,no_inplace}.0)
    input 0: dtype=float32, shape=(20, 256), strides=c 
    input 1: dtype=int8, shape=(20, 256), strides=c 
    output 0: dtype=float32, shape=(20, 256), strides=c 
   2.3%    87.0%       0.326s       2.49e-03s    131    22                     Elemwise{add,no_inplace}(W_dense1, W_dense1_vel)
    input 0: dtype=float32, shape=(784, 256), strides=c 
    input 1: dtype=float32, shape=(784, 256), strides=c 
    output 0: dtype=float32, shape=(784, 256), strides=c 
   1.7%    88.7%       0.242s       1.85e-03s    131    88                     Elemwise{Composite{sqr(Abs(i0))}}(Dot22.0)
    input 0: dtype=float32, shape=(256, 10), strides=c 
    output 0: dtype=float32, shape=(256, 10), strides=c 
   1.6%    90.3%       0.232s       1.77e-03s    131    10                     Dot22(x, W_dense1)
    input 0: dtype=float32, shape=(20, 784), strides=c 
    input 1: dtype=float32, shape=(784, 256), strides=c 
    output 0: dtype=float32, shape=(20, 256), strides=c 
   ... (remaining 147 Apply instances account for 9.69%(1.38s) of the runtime)
 
 Memory Profile
 (Sparse variables are ignored)
 (For values in brackets, it's for linker = c|py
 ---
    Max if no gc (allow_gc=False): 10093KB (9041KB)
    CPU: 10093KB (9041KB)
    GPU: 0KB (0KB)
 ---
    Max if linker=cvm(default): 2754KB (3815KB)
    CPU: 2754KB (3815KB)
    GPU: 0KB (0KB)
 ---
    Memory saved if views are used: 0KB (0KB)
    Memory saved if inplace ops are used: 0KB (0KB)
    Memory saved if gc is enabled: 7339KB (5225KB)
 ---
 
    <Sum apply outputs (bytes)> <Apply outputs shape> <created/inplace/view> <Apply node>
 
        802816B  [(784, 256)] c Elemwise{add,no_inplace}(W_dense1, W_dense1_vel)
        802816B  [(784, 256)] c Elemwise{Composite{(i0 + (i1 * i2))}}(Gemm{no_inplace}.0, TensorConstant{(1, 1) of 0.0002}, W_dense1)
        802816B  [(784, 256)] c Elemwise{Composite{((i0 * i1) - (i2 * i3))}}(TensorConstant{(1, 1) of 0.9}, W_dense1_vel, DimShuffle{x,x}.0, if{}.0)
        802816B  [(784, 256)] c AllocEmpty{dtype='float32'}(Shape_i{1}.0, Shape_i{0}.0)
        802816B  [(784, 256)] c Gemm{no_inplace}(AllocEmpty{dtype='float32'}.0, if{}.0, x.T, Elemwise{mul}.0, TensorConstant{0.0})
        802816B  [(784, 256)] c Elemwise{Composite{sqr(Abs(i0))}}(Dot22.0)
        802816B  [(784, 256)] c Dot22(x.T, Elemwise{mul}.0)
        802816B  [(784, 256)] c if{}(Elemwise{isnan,no_inplace}.0, Alloc.0, Elemwise{Composite{(i0 + (i1 * i2))}}.0)
        262144B  [(256, 256)] c Dot22(DimShuffle{1,0}.0, Elemwise{mul}.0)
        262144B  [(256, 256)] c Elemwise{add,no_inplace}(W_dense2, W_dense2_vel)
        262144B  [(256, 256)] c Elemwise{Composite{sqr(Abs(i0))}}(Dot22.0)
        262144B  [(256, 256)] c DimShuffle{1,0}(W_dense2)
        262144B  [(256, 256)] c Elemwise{Composite{((i0 * i1) - (i2 * i3))}}(TensorConstant{(1, 1) of 0.9}, W_dense2_vel, DimShuffle{x,x}.0, if{}.0)
        262144B  [(256, 256)] c AllocEmpty{dtype='float32'}(Shape_i{1}.0, Shape_i{0}.0)
        262144B  [(256, 256)] c Gemm{no_inplace}(AllocEmpty{dtype='float32'}.0, if{}.0, DimShuffle{1,0}.0, Elemwise{mul}.0, TensorConstant{0.0})
        262144B  [(256, 256)] c if{}(Elemwise{isnan,no_inplace}.0, Alloc.0, Elemwise{Composite{(i0 + (i1 * i2))}}.0)
        262144B  [(256, 256)] c Elemwise{Composite{(i0 + (i1 * i2))}}(Gemm{no_inplace}.0, TensorConstant{(1, 1) of 0.0002}, W_dense2)
         62720B  [(784, 20)] c DimShuffle{1,0}(x)
         20480B  [(20, 256)] c Elemwise{mul}(Dot22.0, Elemwise{gt,no_inplace}.0)
         20480B  [(20, 256)] c Dot22(x, W_dense1)
   ... (remaining 147 Apply account for 372432B/9257936B ((4.02%)) of the Apply with dense outputs sizes)
 
    <created/inplace/view> is taken from the Op's declaration.
    Apply nodes marked 'inplace' or 'view' may actually allocate memory, this is not reported here. If you use DebugMode, warnings will be emitted in those cases.
 
 Function profiling
 ==================
  Message: Sum of all(2) printed profiles at exit excluding Scan op profile.
  Time in 1631 calls to Function.__call__: 1.601294e+01s
  Time in Function.fn.__call__: 1.594616e+01s (99.583%)
  Time in thunks: 1.486443e+01s (92.828%)
  Total compile time: 1.662645e+00s
    Number of Apply nodes: 21
    Theano Optimizer time: 4.918830e-01s
       Theano validate time: 2.593994e-03s
    Theano Linker time (includes C, CUDA code generation/compiling): 2.589970e-01s
       Import time 1.021464e-01s
 
 Time in all call to theano.grad() 1.627803e-02s
 Class
 ---
 <% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name>
  67.1%    67.1%       9.974s       5.07e-04s     C    19670      83   theano.tensor.elemwise.Elemwise
  16.7%    83.8%       2.487s       4.48e-04s     C     5548      11   theano.tensor.blas.Dot22
  12.6%    96.4%       1.872s       4.76e-03s     C      393       3   theano.tensor.blas.Gemm
   1.3%    97.7%       0.193s       2.45e-04s     C      786       6   theano.tensor.elemwise.CAReduce
   0.7%    98.5%       0.108s       3.43e-05s     Py    3144      12   theano.ifelse.IfElse
   0.7%    99.1%       0.101s       2.03e-05s     C     4965      17   theano.tensor.elemwise.Sum
   0.2%    99.3%       0.032s       5.06e-06s     C     6334      17   theano.tensor.elemwise.DimShuffle
   0.2%    99.5%       0.023s       1.39e-05s     C     1631       2   theano.tensor.basic.MaxAndArgmax
   0.1%    99.6%       0.016s       2.02e-05s     C      786       6   theano.tensor.subtensor.Subtensor
   0.1%    99.7%       0.014s       8.85e-06s     Py    1631       2   theano.tensor.subtensor.AdvancedSubtensor
   0.1%    99.8%       0.011s       6.76e-06s     Py    1631       2   theano.tensor.basic.ARange
   0.1%    99.8%       0.009s       5.53e-06s     C     1631       2   theano.tensor.nnet.nnet.SoftmaxWithBias
   0.1%    99.9%       0.007s       5.70e-05s     C      131       7   theano.tensor.basic.Alloc
   0.0%    99.9%       0.007s       1.68e-06s     C     4048      10   theano.compile.ops.Shape_i
   0.0%   100.0%       0.006s       1.50e-05s     C      393       3   theano.tensor.basic.AllocEmpty
   0.0%   100.0%       0.002s       1.84e-05s     Py     131       1   theano.tensor.subtensor.AdvancedIncSubtensor
   0.0%   100.0%       0.001s       2.97e-06s     C      393       3   theano.tensor.opt.MakeVector
   0.0%   100.0%       0.001s       3.90e-06s     C      131       1   theano.tensor.nnet.nnet.SoftmaxGrad
   ... (remaining 0 Classes account for   0.00%(0.00s) of the runtime)
 
 Ops
 ---
 <% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name>
  16.7%    16.7%       2.487s       4.48e-04s     C     5548       11   Dot22
  14.8%    31.6%       2.206s       5.61e-03s     C      393        3   Elemwise{Composite{(i0 + (i1 * i2))}}
  14.8%    46.3%       2.195s       2.79e-03s     C      786        6   Elemwise{Composite{sqr(Abs(i0))}}
  12.6%    58.9%       1.872s       4.76e-03s     C      393        3   Gemm{no_inplace}
  10.0%    69.0%       1.494s       5.58e-04s     C     2679       10   Elemwise{add,no_inplace}
   9.8%    78.7%       1.450s       1.85e-03s     C      786        6   Elemwise{Composite{((i0 * i1) - (i2 * i3))}}
   7.1%    85.9%       1.063s       2.03e-03s     C      524        4   Elemwise{mul}
   5.1%    90.9%       0.752s       2.87e-03s     C      262        2   Elemwise{gt,no_inplace}
   4.8%    95.8%       0.715s       2.73e-03s     C      262        2   Elemwise{Composite{Abs((i0 * i1))}}
   1.3%    97.0%       0.193s       2.45e-04s     C      786        6   Reduce{maximum}
   0.7%    97.8%       0.108s       3.43e-05s     Py    3144       12   if{}
   0.6%    98.4%       0.095s       3.23e-05s     C     2941       12   Sum{acc_dtype=float64}
   0.4%    98.8%       0.062s       2.08e-05s     C     3000        2   Elemwise{Composite{((i0 + i1) * GT((i0 + i1), i2))}}
   0.2%    99.0%       0.023s       1.39e-05s     C     1631        2   MaxAndArgmax
   0.1%    99.1%       0.016s       2.02e-05s     C      786        6   Subtensor{int64}
   0.1%    99.2%       0.016s       2.39e-05s     C      655        5   DimShuffle{1,0}
   0.1%    99.3%       0.014s       8.85e-06s     Py    1631        2   AdvancedSubtensor
   0.1%    99.4%       0.013s       2.73e-06s     C     4893        6   DimShuffle{x,0}
   0.1%    99.5%       0.011s       6.76e-06s     Py    1631        2   ARange
   0.1%    99.5%       0.009s       5.53e-06s     C     1631        2   SoftmaxWithBias
   ... (remaining 28 Ops account for   0.48%(0.07s) of the runtime)
 
 Apply
 ------
 <% time> <sum %> <apply time> <time per call> <#call> <id> <Mflops> <Gflops/s> <Apply name>
   6.9%     6.9%       1.030s       7.86e-03s    131   114                     Elemwise{Composite{sqr(Abs(i0))}}(Dot22.0)
    input 0: dtype=float32, shape=(784, 256), strides=c 
    output 0: dtype=float32, shape=(784, 256), strides=c 
   6.9%    13.8%       1.026s       7.83e-03s    131   154                     Elemwise{Composite{(i0 + (i1 * i2))}}(Gemm{no_inplace}.0, TensorConstant{(1, 1) of 0.0002}, W_dense1)
    input 0: dtype=float32, shape=(784, 256), strides=c 
    input 1: dtype=float32, shape=(1, 1), strides=c 
    input 2: dtype=float32, shape=(784, 256), strides=c 
    output 0: dtype=float32, shape=(784, 256), strides=c 
   6.5%    20.3%       0.966s       7.38e-03s    131   140                     Elemwise{Composite{(i0 + (i1 * i2))}}(Gemm{no_inplace}.0, TensorConstant{(1, 1) of 0.0002}, W_dense2)
    input 0: dtype=float32, shape=(256, 256), strides=c 
    input 1: dtype=float32, shape=(1, 1), strides=c 
    input 2: dtype=float32, shape=(256, 256), strides=c 
    output 0: dtype=float32, shape=(256, 256), strides=c 
   6.4%    26.7%       0.947s       7.23e-03s    131   148                     Gemm{no_inplace}(AllocEmpty{dtype='float32'}.0, if{}.0, x.T, Elemwise{mul}.0, TensorConstant{0.0})
    input 0: dtype=float32, shape=(784, 256), strides=c 
    input 1: dtype=float32, shape=(), strides=c 
    input 2: dtype=float32, shape=(784, 20), strides=(4, 3136) 
    input 3: dtype=float32, shape=(20, 256), strides=c 
    input 4: dtype=float32, shape=(), strides=c 
    output 0: dtype=float32, shape=(784, 256), strides=c 
   6.2%    32.9%       0.921s       7.03e-03s    131   132                     Gemm{no_inplace}(AllocEmpty{dtype='float32'}.0, if{}.0, DimShuffle{1,0}.0, Elemwise{mul}.0, TensorConstant{0.0})
    input 0: dtype=float32, shape=(256, 256), strides=c 
    input 1: dtype=float32, shape=(), strides=c 
    input 2: dtype=float32, shape=(256, 20), strides=(4, 1024) 
    input 3: dtype=float32, shape=(20, 256), strides=c 
    input 4: dtype=float32, shape=(), strides=c 
    output 0: dtype=float32, shape=(256, 256), strides=c 
   6.2%    39.1%       0.921s       7.03e-03s    131    99                     Elemwise{Composite{sqr(Abs(i0))}}(Dot22.0)
    input 0: dtype=float32, shape=(256, 256), strides=c 
    output 0: dtype=float32, shape=(256, 256), strides=c 
   5.8%    44.9%       0.861s       6.57e-03s    131    94                     Dot22(DimShuffle{1,0}.0, Elemwise{mul}.0)
    input 0: dtype=float32, shape=(256, 20), strides=(4, 1024) 
    input 1: dtype=float32, shape=(20, 256), strides=c 
    output 0: dtype=float32, shape=(256, 256), strides=c 
   5.5%    50.4%       0.812s       6.20e-03s    131   108                     Dot22(x.T, Elemwise{mul}.0)
    input 0: dtype=float32, shape=(784, 20), strides=(4, 3136) 
    input 1: dtype=float32, shape=(20, 256), strides=c 
    output 0: dtype=float32, shape=(784, 256), strides=c 
   5.2%    55.6%       0.780s       5.95e-03s    131   166                     Elemwise{Composite{((i0 * i1) - (i2 * i3))}}(TensorConstant{(1, 1) of 0.9}, W_dense1_vel, DimShuffle{x,x}.0, if{}.0)
    input 0: dtype=float32, shape=(1, 1), strides=c 
    input 1: dtype=float32, shape=(784, 256), strides=c 
    input 2: dtype=float32, shape=(1, 1), strides=c 
    input 3: dtype=float32, shape=(784, 256), strides=c 
    output 0: dtype=float32, shape=(784, 256), strides=c 
   4.3%    59.9%       0.638s       4.87e-03s    131    18                     Elemwise{add,no_inplace}(W_dense3, W_dense3_vel)
    input 0: dtype=float32, shape=(256, 10), strides=c 
    input 1: dtype=float32, shape=(256, 10), strides=c 
    output 0: dtype=float32, shape=(256, 10), strides=c 
   3.7%    63.6%       0.554s       4.23e-03s    131   160                     Elemwise{Composite{((i0 * i1) - (i2 * i3))}}(TensorConstant{(1, 1) of 0.9}, W_dense2_vel, DimShuffle{x,x}.0, if{}.0)
    input 0: dtype=float32, shape=(1, 1), strides=c 
    input 1: dtype=float32, shape=(256, 256), strides=c 
    input 2: dtype=float32, shape=(1, 1), strides=c 
    input 3: dtype=float32, shape=(256, 256), strides=c 
    output 0: dtype=float32, shape=(256, 256), strides=c 
   3.5%    67.2%       0.525s       4.01e-03s    131    39                     Elemwise{gt,no_inplace}(Elemwise{add,no_inplace}.0, TensorConstant{(1, 1) of 0})
    input 0: dtype=float32, shape=(20, 256), strides=c 
    input 1: dtype=int8, shape=(1, 1), strides=c 
    output 0: dtype=int8, shape=(20, 256), strides=c 
   3.4%    70.6%       0.510s       3.90e-03s    131    20                     Elemwise{add,no_inplace}(W_dense2, W_dense2_vel)
    input 0: dtype=float32, shape=(256, 256), strides=c 
    input 1: dtype=float32, shape=(256, 256), strides=c 
    output 0: dtype=float32, shape=(256, 256), strides=c 
   3.0%    73.6%       0.445s       3.40e-03s    131   100                     Elemwise{mul}(Dot22.0, Elemwise{gt,no_inplace}.0)
    input 0: dtype=float32, shape=(20, 256), strides=c 
    input 1: dtype=int8, shape=(20, 256), strides=c 
    output 0: dtype=float32, shape=(20, 256), strides=c 
   2.5%    76.1%       0.373s       2.49e-04s   1500     5                     Dot22(x, W_dense1)
    input 0: dtype=float32, shape=(20, 784), strides=c 
    input 1: dtype=float32, shape=(784, 256), strides=c 
    output 0: dtype=float32, shape=(20, 256), strides=c 
   2.5%    78.6%       0.373s       2.85e-03s    131    44                     Elemwise{mul,no_inplace}(Elemwise{add,no_inplace}.0, Elemwise{gt,no_inplace}.0)
    input 0: dtype=float32, shape=(20, 256), strides=c 
    input 1: dtype=int8, shape=(20, 256), strides=c 
    output 0: dtype=float32, shape=(20, 256), strides=c 
   2.4%    81.0%       0.364s       2.78e-03s    131    45                     Elemwise{Composite{Abs((i0 * i1))}}(Elemwise{add,no_inplace}.0, Elemwise{gt,no_inplace}.0)
    input 0: dtype=float32, shape=(20, 256), strides=c 
    input 1: dtype=int8, shape=(20, 256), strides=c 
    output 0: dtype=float32, shape=(20, 256), strides=c 
   2.4%    83.4%       0.351s       2.68e-03s    131    61                     Elemwise{Composite{Abs((i0 * i1))}}(Elemwise{add,no_inplace}.0, Elemwise{gt,no_inplace}.0)
    input 0: dtype=float32, shape=(20, 256), strides=c 
    input 1: dtype=int8, shape=(20, 256), strides=c 
    output 0: dtype=float32, shape=(20, 256), strides=c 
   2.2%    85.6%       0.326s       2.49e-03s    131    22                     Elemwise{add,no_inplace}(W_dense1, W_dense1_vel)
    input 0: dtype=float32, shape=(784, 256), strides=c 
    input 1: dtype=float32, shape=(784, 256), strides=c 
    output 0: dtype=float32, shape=(784, 256), strides=c 
   1.6%    87.2%       0.242s       1.85e-03s    131    88                     Elemwise{Composite{sqr(Abs(i0))}}(Dot22.0)
    input 0: dtype=float32, shape=(256, 10), strides=c 
    output 0: dtype=float32, shape=(256, 10), strides=c 
   ... (remaining 168 Apply instances account for 12.77%(1.90s) of the runtime)
 
 Memory Profile (the max between all functions in that profile)
 (Sparse variables are ignored)
 (For values in brackets, it's for linker = c|py
 ---
    Max if no gc (allow_gc=False): 10093KB (9041KB)
    CPU: 10093KB (9041KB)
    GPU: 0KB (0KB)
 ---
    Max if linker=cvm(default): 2754KB (3815KB)
    CPU: 2754KB (3815KB)
    GPU: 0KB (0KB)
 ---
    Memory saved if views are used: 0KB (0KB)
    Memory saved if inplace ops are used: 0KB (0KB)
    Memory saved if gc is enabled: 7339KB (5225KB)
 ---
 
    This list is based on all functions in the profile
    <Sum apply outputs (bytes)> <Apply outputs shape> <created/inplace/view> <Apply node>
 
        802816B  [(784, 256)] c Elemwise{add,no_inplace}(W_dense1, W_dense1_vel)
        802816B  [(784, 256)] c Elemwise{Composite{(i0 + (i1 * i2))}}(Gemm{no_inplace}.0, TensorConstant{(1, 1) of 0.0002}, W_dense1)
        802816B  [(784, 256)] c Elemwise{Composite{((i0 * i1) - (i2 * i3))}}(TensorConstant{(1, 1) of 0.9}, W_dense1_vel, DimShuffle{x,x}.0, if{}.0)
        802816B  [(784, 256)] c Gemm{no_inplace}(AllocEmpty{dtype='float32'}.0, if{}.0, x.T, Elemwise{mul}.0, TensorConstant{0.0})
        802816B  [(784, 256)] c Dot22(x.T, Elemwise{mul}.0)
        802816B  [(784, 256)] c AllocEmpty{dtype='float32'}(Shape_i{1}.0, Shape_i{0}.0)
        802816B  [(784, 256)] c if{}(Elemwise{isnan,no_inplace}.0, Alloc.0, Elemwise{Composite{(i0 + (i1 * i2))}}.0)
        802816B  [(784, 256)] c Elemwise{Composite{sqr(Abs(i0))}}(Dot22.0)
        262144B  [(256, 256)] c Elemwise{Composite{sqr(Abs(i0))}}(Dot22.0)
        262144B  [(256, 256)] c Elemwise{Composite{((i0 * i1) - (i2 * i3))}}(TensorConstant{(1, 1) of 0.9}, W_dense2_vel, DimShuffle{x,x}.0, if{}.0)
        262144B  [(256, 256)] c Dot22(DimShuffle{1,0}.0, Elemwise{mul}.0)
        262144B  [(256, 256)] c if{}(Elemwise{isnan,no_inplace}.0, Alloc.0, Elemwise{Composite{(i0 + (i1 * i2))}}.0)
        262144B  [(256, 256)] c Gemm{no_inplace}(AllocEmpty{dtype='float32'}.0, if{}.0, DimShuffle{1,0}.0, Elemwise{mul}.0, TensorConstant{0.0})
        262144B  [(256, 256)] c Elemwise{add,no_inplace}(W_dense2, W_dense2_vel)
        262144B  [(256, 256)] c DimShuffle{1,0}(W_dense2)
        262144B  [(256, 256)] c AllocEmpty{dtype='float32'}(Shape_i{1}.0, Shape_i{0}.0)
        262144B  [(256, 256)] c Elemwise{Composite{(i0 + (i1 * i2))}}(Gemm{no_inplace}.0, TensorConstant{(1, 1) of 0.0002}, W_dense2)
         62720B  [(784, 20)] c DimShuffle{1,0}(x)
         20480B  [(20, 256)] c Elemwise{mul,no_inplace}(Elemwise{add,no_inplace}.0, Elemwise{gt,no_inplace}.0)
         20480B  [(20, 256)] c Elemwise{mul}(Dot22.0, Elemwise{gt,no_inplace}.0)
   ... (remaining 168 Apply account for 460180B/9345684B ((4.92%)) of the Apply with dense outputs sizes)
 
    <created/inplace/view> is taken from the Op's declaration.
    Apply nodes marked 'inplace' or 'view' may actually allocate memory, this is not reported here. If you use DebugMode, warnings will be emitted in those cases.