fcn keras profiling
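Output like this comes from Theano's built-in profiler, which prints one "Function profiling" block per compiled function when the Python process exits. A minimal sketch of how to enable it (assuming the Theano 0.7-era API in use here; the toy graph below is a stand-in, not the actual FCN):

    import numpy as np
    import theano
    import theano.tensor as T

    theano.config.profile = True  # or THEANO_FLAGS=profile=True in the environment

    x = T.matrix('x')
    # Every theano.function compiled while profiling is enabled gets its own
    # "Function profiling" block in the output at exit.
    f = theano.function([x], T.nnet.softmax(x))
    f(np.random.rand(4, 10).astype('float32'))

Keras 0.x compiles several functions in model.compile() (the models.py:399-407 call sites below), which is why five separate profiles appear, followed by their sum at exit.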
Function profiling
==================
Message: build/bdist.linux-x86_64/egg/keras/models.py:399
Time in 1 calls to Function.__call__: 1.027354e+01s
Time in Function.fn.__call__: 1.026697e+01s (99.936%)
Time in thunks: 1.017123e+01s (99.004%)
Total compile time: 1.421726e+00s
Number of Apply nodes: 140
Theano Optimizer time: 1.223497e+00s
Theano validate time: 6.331444e-02s
Theano Linker time (includes C, CUDA code generation/compiling): 1.472740e-01s
Import time 4.217982e-02s
Time in all call to theano.grad() 5.796099e-02s
Time since theano import 22.311s
Class
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name>
27.1% 27.1% 2.752s 9.17e-01s C 3 3 theano.sandbox.cuda.dnn.GpuDnnConvGradW
25.5% 52.6% 2.595s 2.36e-01s C 11 11 theano.tensor.elemwise.Elemwise
16.1% 68.6% 1.635s 1.63e+00s Py 1 1 theano.tensor.subtensor.AdvancedIncSubtensor
5.9% 74.5% 0.600s 2.00e-01s Py 3 3 theano.tensor.subtensor.AdvancedSubtensor
5.8% 80.3% 0.585s 2.93e-01s C 2 2 theano.sandbox.cuda.dnn.GpuDnnConvGradI
3.8% 84.1% 0.384s 9.59e-02s C 4 4 theano.tensor.elemwise.Sum
3.6% 87.6% 0.362s 3.62e-01s C 1 1 theano.sandbox.cuda.basic_ops.HostFromGpu
2.8% 90.4% 0.287s 1.06e-02s C 27 27 theano.sandbox.cuda.basic_ops.GpuElemwise
2.3% 92.7% 0.232s 7.74e-02s C 3 3 theano.sandbox.cuda.dnn.GpuDnnConv
2.0% 94.7% 0.202s 2.02e-01s Py 1 1 theano.tensor.basic.Nonzero
1.4% 96.1% 0.145s 1.45e-01s C 1 1 theano.sandbox.cuda.dnn.GpuDnnSoftmax
1.2% 97.3% 0.118s 9.86e-03s C 12 12 theano.sandbox.cuda.basic_ops.GpuContiguous
1.0% 98.3% 0.102s 1.02e-01s C 1 1 theano.sandbox.cuda.dnn.GpuDnnSoftmaxGrad
0.8% 99.1% 0.081s 4.06e-02s C 2 2 theano.sandbox.cuda.basic_ops.GpuFromHost
0.5% 99.6% 0.054s 5.45e-02s C 1 1 theano.tensor.basic.Alloc
0.3% 100.0% 0.033s 1.12e-02s C 3 3 theano.sandbox.cuda.basic_ops.GpuCAReduce
0.0% 100.0% 0.003s 3.82e-04s C 8 8 theano.sandbox.cuda.basic_ops.GpuAllocEmpty
0.0% 100.0% 0.000s 3.38e-06s C 12 12 theano.sandbox.cuda.basic_ops.GpuDimShuffle
0.0% 100.0% 0.000s 1.55e-06s C 22 22 theano.compile.ops.Shape_i
0.0% 100.0% 0.000s 2.57e-06s C 9 9 theano.tensor.opt.MakeVector
... (remaining 4 Classes account for 0.00%(0.00s) of the runtime)
Ops
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name>
27.1% 27.1% 2.752s 9.17e-01s C 3 3 GpuDnnConvGradW{algo='none', inplace=True}
17.1% 44.1% 1.736s 1.74e+00s C 1 1 Elemwise{Composite{(i0 * log((i1 / i2)))}}[(0, 0)]
16.1% 60.2% 1.635s 1.63e+00s Py 1 1 AdvancedIncSubtensor{inplace=False, set_instead_of_inc=False}
5.9% 66.1% 0.600s 2.00e-01s Py 3 3 AdvancedSubtensor
5.8% 71.9% 0.585s 2.93e-01s C 2 2 GpuDnnConvGradI{algo='none', inplace=True}
4.4% 76.3% 0.451s 4.51e-01s C 1 1 Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((-i3) / i4) + i5))}}[(0, 0)]
3.8% 80.1% 0.390s 3.90e-01s C 1 1 Elemwise{clip,no_inplace}
3.6% 83.7% 0.362s 3.62e-01s C 1 1 HostFromGpu
2.6% 86.3% 0.265s 1.33e-01s C 2 2 Sum{axis=[1], acc_dtype=float64}
2.3% 88.6% 0.232s 7.74e-02s C 3 3 GpuDnnConv{algo='small', inplace=True}
2.0% 90.6% 0.202s 2.02e-01s Py 1 1 Nonzero
1.4% 92.0% 0.145s 1.45e-01s C 1 1 GpuDnnSoftmax{tensor_format='bc01', mode='channel', algo='accurate'}
1.3% 93.3% 0.132s 4.40e-02s C 3 3 GpuElemwise{Composite{((i0 * i1) + (i0 * i1 * sgn(i2)))}}[(0, 1)]
1.2% 94.5% 0.118s 9.86e-03s C 12 12 GpuContiguous
1.2% 95.6% 0.118s 5.92e-02s C 2 2 Sum{acc_dtype=float64}
1.0% 96.6% 0.102s 1.02e-01s C 1 1 GpuDnnSoftmaxGrad{tensor_format='bc01', mode='channel', algo='accurate'}
0.9% 97.5% 0.093s 1.04e-02s C 9 9 GpuElemwise{Add}[(0, 0)]
0.8% 98.3% 0.081s 4.06e-02s C 2 2 GpuFromHost
0.6% 98.9% 0.060s 2.00e-02s C 3 3 GpuElemwise{Composite{(i0 * (i1 + Abs(i1)))},no_inplace}
0.5% 99.5% 0.054s 5.45e-02s C 1 1 Alloc
... (remaining 26 Ops account for 0.54%(0.05s) of the runtime)
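A pattern worth flagging in the Ops table above: AdvancedIncSubtensor, AdvancedSubtensor and Nonzero are Python-implemented ("Py") host ops, and together with the CPU Elemwise rows they outweigh the cuDNN convolutions. Fancy (integer-array) indexing is what generates these ops; a hedged minimal reproduction (not the actual Keras code) looks like:

    import theano
    import theano.tensor as T

    y = T.tensor4('y')
    i = T.lvector('i')
    cost = y[i, i, i].sum()    # forward pass: AdvancedSubtensor (Py op on the host)
    g = theano.grad(cost, y)   # backward pass: AdvancedIncSubtensor (Py op on the host)
    f = theano.function([y, i], g)

The same signature appears in the Apply list below as AdvancedSubtensor(y, Subtensor{int64}.0, ...) and its gradient at node 97.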
Apply
------
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name>
19.9% 19.9% 2.023s 2.02e+00s 1 124 GpuDnnConvGradW{algo='none', inplace=True}(GpuContiguous.0, GpuContiguous.0, GpuAllocEmpty.0, GpuDnnConvDesc{border_mode=(1, 1), subsample=(1, 1), conv_mode='conv'}.0, Constant{1.0}, Constant{0.0})
17.1% 37.0% 1.736s 1.74e+00s 1 96 Elemwise{Composite{(i0 * log((i1 / i2)))}}[(0, 0)](AdvancedSubtensor.0, Elemwise{clip,no_inplace}.0, InplaceDimShuffle{0,x}.0)
16.1% 53.0% 1.635s 1.63e+00s 1 97 AdvancedIncSubtensor{inplace=False, set_instead_of_inc=False}(Alloc.0, Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((-i3) / i4) + i5))}}[(0, 0)].0, Subtensor{int64}.0, Subtensor{int64}.0, Subtensor{int64}.0)
4.4% 57.5% 0.451s 4.51e-01s 1 95 Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((-i3) / i4) + i5))}}[(0, 0)](AdvancedSubtensor.0, TensorConstant{(1, 1) of 1e-07}, TensorConstant{(1, 1) of 1.0}, AdvancedSubtensor.0, Elemwise{clip,no_inplace}.0, Elemwise{TrueDiv}[(0, 0)].0)
4.1% 61.6% 0.420s 4.20e-01s 1 113 GpuDnnConvGradW{algo='none', inplace=True}(GpuContiguous.0, GpuContiguous.0, GpuAllocEmpty.0, GpuDnnConvDesc{border_mode=(1, 1), subsample=(1, 1), conv_mode='conv'}.0, Constant{1.0}, Constant{0.0})
3.8% 65.4% 0.390s 3.90e-01s 1 91 Elemwise{clip,no_inplace}(AdvancedSubtensor.0, TensorConstant{(1, 1) of 1e-07}, TensorConstant{(1, 1) of 1.0})
3.6% 69.0% 0.362s 3.62e-01s 1 84 HostFromGpu(GpuReshape{4}.0)
3.5% 72.5% 0.357s 3.57e-01s 1 89 AdvancedSubtensor(HostFromGpu.0, Subtensor{int64}.0, Subtensor{int64}.0, Subtensor{int64}.0)
3.5% 76.0% 0.353s 3.53e-01s 1 123 GpuDnnConvGradI{algo='none', inplace=True}(GpuContiguous.0, GpuContiguous.0, GpuAllocEmpty.0, GpuDnnConvDesc{border_mode=(1, 1), subsample=(1, 1), conv_mode='conv'}.0, Constant{1.0}, Constant{0.0})
3.0% 79.0% 0.309s 3.09e-01s 1 134 GpuDnnConvGradW{algo='none', inplace=True}(GpuContiguous.0, GpuContiguous.0, GpuAllocEmpty.0, GpuDnnConvDesc{border_mode=(1, 1), subsample=(1, 1), conv_mode='conv'}.0, Constant{1.0}, Constant{0.0})
2.3% 81.3% 0.232s 2.32e-01s 1 112 GpuDnnConvGradI{algo='none', inplace=True}(GpuContiguous.0, GpuContiguous.0, GpuAllocEmpty.0, GpuDnnConvDesc{border_mode=(1, 1), subsample=(1, 1), conv_mode='conv'}.0, Constant{1.0}, Constant{0.0})
2.0% 83.3% 0.202s 2.02e-01s 1 23 Nonzero(<TensorType(float32, 4D)>)
1.7% 85.0% 0.173s 1.73e-01s 1 49 AdvancedSubtensor(y, Subtensor{int64}.0, Subtensor{int64}.0, Subtensor{int64}.0)
1.5% 86.5% 0.153s 1.53e-01s 1 92 Sum{axis=[1], acc_dtype=float64}(Elemwise{clip,no_inplace}.0)
1.4% 87.9% 0.145s 1.45e-01s 1 79 GpuDnnSoftmax{tensor_format='bc01', mode='channel', algo='accurate'}(GpuContiguous.0)
1.4% 89.3% 0.145s 1.45e-01s 1 65 GpuDnnConv{algo='small', inplace=True}(GpuContiguous.0, GpuContiguous.0, GpuAllocEmpty.0, GpuDnnConvDesc{border_mode=(1, 1), subsample=(1, 1), conv_mode='conv'}.0, Constant{1.0}, Constant{0.0})
1.2% 90.5% 0.118s 1.18e-01s 1 110 GpuContiguous(GpuElemwise{Composite{((i0 * i1) + (i0 * i1 * sgn(i2)))}}[(0, 1)].0)
1.1% 91.6% 0.114s 1.14e-01s 1 98 Sum{acc_dtype=float64}(Elemwise{Composite{(i0 * log((i1 / i2)))}}[(0, 0)].0)
1.1% 92.7% 0.112s 1.12e-01s 1 58 Sum{axis=[1], acc_dtype=float64}(AdvancedSubtensor.0)
1.0% 93.7% 0.102s 1.02e-01s 1 104 GpuDnnSoftmaxGrad{tensor_format='bc01', mode='channel', algo='accurate'}(GpuContiguous.0, GpuContiguous.0)
... (remaining 120 Apply instances account for 6.27%(0.64s) of the runtime)
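The hottest non-convolution nodes above (Elemwise{clip,no_inplace} with constants 1e-07 and 1.0 at node 91, Sum{axis=[1]} at node 92, and the Composite{(i0 * log((i1 / i2)))} at node 96 feeding the final Sum) have the shape of a categorical crossentropy evaluated on the CPU after the HostFromGpu copy at node 84. A paraphrased reconstruction of such a loss (hedged; this mirrors the node structure, not the exact Keras 0.x source):

    import theano.tensor as T

    def categorical_crossentropy(y_true, y_pred, epsilon=1e-07):
        y_pred = T.clip(y_pred, epsilon, 1.0)         # Elemwise{clip,no_inplace}, node 91
        norm = y_pred.sum(axis=1, keepdims=True)      # Sum{axis=[1]} + InplaceDimShuffle{0,x}
        return -T.sum(y_true * T.log(y_pred / norm))  # Composite{(i0 * log((i1 / i2)))}, node 96

The HostFromGpu/GpuFromHost pairs in the same table are the predictions being copied off the GPU for this loss and the gradient copied back, which is a large part of why the loss dominates this GPU profile.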
Function profiling
==================
Message: build/bdist.linux-x86_64/egg/keras/models.py:401
Time in 0 calls to Function.__call__: 0.000000e+00s
Total compile time: 1.072570e+00s
Number of Apply nodes: 153
Theano Optimizer time: 9.243569e-01s
Theano validate time: 6.185031e-02s
Theano Linker time (includes C, CUDA code generation/compiling): 1.079371e-01s
Import time 1.020193e-03s
Time in all call to theano.grad() 5.796099e-02s
Time since theano import 22.320s
Function profiling
==================
Message: build/bdist.linux-x86_64/egg/keras/models.py:403
Time in 0 calls to Function.__call__: 0.000000e+00s
Total compile time: 3.260720e-01s
Number of Apply nodes: 60
Theano Optimizer time: 2.644930e-01s
Theano validate time: 7.456541e-03s
Theano Linker time (includes C, CUDA code generation/compiling): 4.158711e-02s
Import time 9.508133e-04s
Time in all call to theano.grad() 5.796099e-02s
Time since theano import 22.320s
Function profiling
==================
Message: build/bdist.linux-x86_64/egg/keras/models.py:405
Time in 0 calls to Function.__call__: 0.000000e+00s
Total compile time: 5.981460e-01s
Number of Apply nodes: 75
Theano Optimizer time: 3.249922e-01s
Theano validate time: 9.876013e-03s
Theano Linker time (includes C, CUDA code generation/compiling): 4.987216e-02s
Import time 2.160072e-04s
Time in all call to theano.grad() 5.796099e-02s
Time since theano import 22.320s
Function profiling
==================
Message: build/bdist.linux-x86_64/egg/keras/models.py:407
Time in 0 calls to Function.__call__: 0.000000e+00s
Total compile time: 4.345279e-01s
Number of Apply nodes: 88
Theano Optimizer time: 3.557758e-01s
Theano validate time: 1.038814e-02s
Theano Linker time (includes C, CUDA code generation/compiling): 5.503392e-02s
Import time 0.000000e+00s
Time in all call to theano.grad() 5.796099e-02s
Time since theano import 22.320s
Function profiling
==================
Message: Sum of all(5) printed profiles at exit excluding Scan op profile.
Time in 1 calls to Function.__call__: 1.027354e+01s
Time in Function.fn.__call__: 1.026697e+01s (99.936%)
Time in thunks: 1.017123e+01s (99.004%)
Total compile time: 3.853042e+00s
Number of Apply nodes: 140
Theano Optimizer time: 3.093115e+00s
Theano validate time: 1.528854e-01s
Theano Linker time (includes C, CUDA code generation/compiling): 4.017043e-01s
Import time 4.436684e-02s
Time in all call to theano.grad() 5.796099e-02s
Time since theano import 22.320s
Class
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name>
27.1% 27.1% 2.752s 9.17e-01s C 3 3 theano.sandbox.cuda.dnn.GpuDnnConvGradW
25.5% 52.6% 2.595s 2.36e-01s C 11 11 theano.tensor.elemwise.Elemwise
16.1% 68.6% 1.635s 1.63e+00s Py 1 1 theano.tensor.subtensor.AdvancedIncSubtensor
5.9% 74.5% 0.600s 2.00e-01s Py 3 3 theano.tensor.subtensor.AdvancedSubtensor
5.8% 80.3% 0.585s 2.93e-01s C 2 2 theano.sandbox.cuda.dnn.GpuDnnConvGradI
3.8% 84.1% 0.384s 9.59e-02s C 4 4 theano.tensor.elemwise.Sum
3.6% 87.6% 0.362s 3.62e-01s C 1 1 theano.sandbox.cuda.basic_ops.HostFromGpu
2.8% 90.4% 0.287s 1.06e-02s C 27 27 theano.sandbox.cuda.basic_ops.GpuElemwise
2.3% 92.7% 0.232s 7.74e-02s C 3 3 theano.sandbox.cuda.dnn.GpuDnnConv
2.0% 94.7% 0.202s 2.02e-01s Py 1 1 theano.tensor.basic.Nonzero
1.4% 96.1% 0.145s 1.45e-01s C 1 1 theano.sandbox.cuda.dnn.GpuDnnSoftmax
1.2% 97.3% 0.118s 9.86e-03s C 12 12 theano.sandbox.cuda.basic_ops.GpuContiguous
1.0% 98.3% 0.102s 1.02e-01s C 1 1 theano.sandbox.cuda.dnn.GpuDnnSoftmaxGrad
0.8% 99.1% 0.081s 4.06e-02s C 2 2 theano.sandbox.cuda.basic_ops.GpuFromHost
0.5% 99.6% 0.054s 5.45e-02s C 1 1 theano.tensor.basic.Alloc
0.3% 100.0% 0.033s 1.12e-02s C 3 3 theano.sandbox.cuda.basic_ops.GpuCAReduce
0.0% 100.0% 0.003s 3.82e-04s C 8 8 theano.sandbox.cuda.basic_ops.GpuAllocEmpty
0.0% 100.0% 0.000s 3.38e-06s C 12 12 theano.sandbox.cuda.basic_ops.GpuDimShuffle
0.0% 100.0% 0.000s 1.55e-06s C 22 22 theano.compile.ops.Shape_i
0.0% 100.0% 0.000s 2.57e-06s C 9 9 theano.tensor.opt.MakeVector
... (remaining 4 Classes account for 0.00%(0.00s) of the runtime)
Ops
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name>
27.1% 27.1% 2.752s 9.17e-01s C 3 3 GpuDnnConvGradW{algo='none', inplace=True}
17.1% 44.1% 1.736s 1.74e+00s C 1 1 Elemwise{Composite{(i0 * log((i1 / i2)))}}[(0, 0)]
16.1% 60.2% 1.635s 1.63e+00s Py 1 1 AdvancedIncSubtensor{inplace=False, set_instead_of_inc=False}
5.9% 66.1% 0.600s 2.00e-01s Py 3 3 AdvancedSubtensor
5.8% 71.9% 0.585s 2.93e-01s C 2 2 GpuDnnConvGradI{algo='none', inplace=True}
4.4% 76.3% 0.451s 4.51e-01s C 1 1 Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((-i3) / i4) + i5))}}[(0, 0)]
3.8% 80.1% 0.390s 3.90e-01s C 1 1 Elemwise{clip,no_inplace}
3.6% 83.7% 0.362s 3.62e-01s C 1 1 HostFromGpu
2.6% 86.3% 0.265s 1.33e-01s C 2 2 Sum{axis=[1], acc_dtype=float64}
2.3% 88.6% 0.232s 7.74e-02s C 3 3 GpuDnnConv{algo='small', inplace=True}
2.0% 90.6% 0.202s 2.02e-01s Py 1 1 Nonzero
1.4% 92.0% 0.145s 1.45e-01s C 1 1 GpuDnnSoftmax{tensor_format='bc01', mode='channel', algo='accurate'}
1.3% 93.3% 0.132s 4.40e-02s C 3 3 GpuElemwise{Composite{((i0 * i1) + (i0 * i1 * sgn(i2)))}}[(0, 1)]
1.2% 94.5% 0.118s 9.86e-03s C 12 12 GpuContiguous
1.2% 95.6% 0.118s 5.92e-02s C 2 2 Sum{acc_dtype=float64}
1.0% 96.6% 0.102s 1.02e-01s C 1 1 GpuDnnSoftmaxGrad{tensor_format='bc01', mode='channel', algo='accurate'}
0.9% 97.5% 0.093s 1.04e-02s C 9 9 GpuElemwise{Add}[(0, 0)]
0.8% 98.3% 0.081s 4.06e-02s C 2 2 GpuFromHost
0.6% 98.9% 0.060s 2.00e-02s C 3 3 GpuElemwise{Composite{(i0 * (i1 + Abs(i1)))},no_inplace}
0.5% 99.5% 0.054s 5.45e-02s C 1 1 Alloc
... (remaining 26 Ops account for 0.54%(0.05s) of the runtime)
Apply
------
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name>
19.9% 19.9% 2.023s 2.02e+00s 1 124 GpuDnnConvGradW{algo='none', inplace=True}(GpuContiguous.0, GpuContiguous.0, GpuAllocEmpty.0, GpuDnnConvDesc{border_mode=(1, 1), subsample=(1, 1), conv_mode='conv'}.0, Constant{1.0}, Constant{0.0})
17.1% 37.0% 1.736s 1.74e+00s 1 96 Elemwise{Composite{(i0 * log((i1 / i2)))}}[(0, 0)](AdvancedSubtensor.0, Elemwise{clip,no_inplace}.0, InplaceDimShuffle{0,x}.0)
16.1% 53.0% 1.635s 1.63e+00s 1 97 AdvancedIncSubtensor{inplace=False, set_instead_of_inc=False}(Alloc.0, Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((-i3) / i4) + i5))}}[(0, 0)].0, Subtensor{int64}.0, Subtensor{int64}.0, Subtensor{int64}.0)
4.4% 57.5% 0.451s 4.51e-01s 1 95 Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((-i3) / i4) + i5))}}[(0, 0)](AdvancedSubtensor.0, TensorConstant{(1, 1) of 1e-07}, TensorConstant{(1, 1) of 1.0}, AdvancedSubtensor.0, Elemwise{clip,no_inplace}.0, Elemwise{TrueDiv}[(0, 0)].0)
4.1% 61.6% 0.420s 4.20e-01s 1 113 GpuDnnConvGradW{algo='none', inplace=True}(GpuContiguous.0, GpuContiguous.0, GpuAllocEmpty.0, GpuDnnConvDesc{border_mode=(1, 1), subsample=(1, 1), conv_mode='conv'}.0, Constant{1.0}, Constant{0.0})
3.8% 65.4% 0.390s 3.90e-01s 1 91 Elemwise{clip,no_inplace}(AdvancedSubtensor.0, TensorConstant{(1, 1) of 1e-07}, TensorConstant{(1, 1) of 1.0})
3.6% 69.0% 0.362s 3.62e-01s 1 84 HostFromGpu(GpuReshape{4}.0)
3.5% 72.5% 0.357s 3.57e-01s 1 89 AdvancedSubtensor(HostFromGpu.0, Subtensor{int64}.0, Subtensor{int64}.0, Subtensor{int64}.0)
3.5% 76.0% 0.353s 3.53e-01s 1 123 GpuDnnConvGradI{algo='none', inplace=True}(GpuContiguous.0, GpuContiguous.0, GpuAllocEmpty.0, GpuDnnConvDesc{border_mode=(1, 1), subsample=(1, 1), conv_mode='conv'}.0, Constant{1.0}, Constant{0.0})
3.0% 79.0% 0.309s 3.09e-01s 1 134 GpuDnnConvGradW{algo='none', inplace=True}(GpuContiguous.0, GpuContiguous.0, GpuAllocEmpty.0, GpuDnnConvDesc{border_mode=(1, 1), subsample=(1, 1), conv_mode='conv'}.0, Constant{1.0}, Constant{0.0})
2.3% 81.3% 0.232s 2.32e-01s 1 112 GpuDnnConvGradI{algo='none', inplace=True}(GpuContiguous.0, GpuContiguous.0, GpuAllocEmpty.0, GpuDnnConvDesc{border_mode=(1, 1), subsample=(1, 1), conv_mode='conv'}.0, Constant{1.0}, Constant{0.0})
2.0% 83.3% 0.202s 2.02e-01s 1 23 Nonzero(<TensorType(float32, 4D)>)
1.7% 85.0% 0.173s 1.73e-01s 1 49 AdvancedSubtensor(y, Subtensor{int64}.0, Subtensor{int64}.0, Subtensor{int64}.0)
1.5% 86.5% 0.153s 1.53e-01s 1 92 Sum{axis=[1], acc_dtype=float64}(Elemwise{clip,no_inplace}.0)
1.4% 87.9% 0.145s 1.45e-01s 1 79 GpuDnnSoftmax{tensor_format='bc01', mode='channel', algo='accurate'}(GpuContiguous.0)
1.4% 89.3% 0.145s 1.45e-01s 1 65 GpuDnnConv{algo='small', inplace=True}(GpuContiguous.0, GpuContiguous.0, GpuAllocEmpty.0, GpuDnnConvDesc{border_mode=(1, 1), subsample=(1, 1), conv_mode='conv'}.0, Constant{1.0}, Constant{0.0})
1.2% 90.5% 0.118s 1.18e-01s 1 110 GpuContiguous(GpuElemwise{Composite{((i0 * i1) + (i0 * i1 * sgn(i2)))}}[(0, 1)].0)
1.1% 91.6% 0.114s 1.14e-01s 1 98 Sum{acc_dtype=float64}(Elemwise{Composite{(i0 * log((i1 / i2)))}}[(0, 0)].0)
1.1% 92.7% 0.112s 1.12e-01s 1 58 Sum{axis=[1], acc_dtype=float64}(AdvancedSubtensor.0)
1.0% 93.7% 0.102s 1.02e-01s 1 104 GpuDnnSoftmaxGrad{tensor_format='bc01', mode='channel', algo='accurate'}(GpuContiguous.0, GpuContiguous.0)
... (remaining 120 Apply instances account for 6.27%(0.64s) of the runtime)