Created
November 12, 2018 18:03
-
-
Save csullivan/ba3e279b4cd233402ee18a8208ae6385 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
INFO:root:start with arguments Namespace(batch_size=64, benchmark=0, brightness=0, contrast=0, data_nthreads=4, data_train='/dataset/mxnet_imagenet/train.rec', data_train_idx='', data_val='/dataset/mxnet_imagenet/val.rec', data_val_idx='', disp_batches=20, dtype='float32', fill_value=127, gc_threshold=0.5, gc_type='none', gpus='0', image_shape='3,224,224', initializer='default', is_nnp=False, kv_store='device', load_epoch=None, loss='', lr=0.1, lr_factor=0.1, lr_step_epochs='30,60', macrobatch_size=0, max_crop_size=-1, max_random_area=1, max_random_aspect_ratio=0, max_random_h=0, max_random_l=0, max_random_rotate_angle=0, max_random_s=0, max_random_scale=1, max_random_shear_ratio=0, min_crop_size=-1, min_random_area=1, min_random_aspect_ratio=None, min_random_scale=1, model_prefix=None, mom=0.9, monitor=0, network='resnet', num_classes=1000, num_epochs=80, num_examples=1281167, num_layers=50, optimizer='sgd', pad_size=0, pca_noise=0, profile_server_suffix='', profile_worker_suffix='', random_crop=0, random_mirror=0, random_resized_crop=0, rgb_mean='123.68,116.779,103.939', rgb_std='1,1,1', saturation=0, save_period=1, test_io=0, top_k=0, warmup_epochs=5, warmup_strategy='linear', wd=0.0001) | |
[09:57:51] src/io/iter_image_recordio_2.cc:172: ImageRecordIOParser2: /dataset/mxnet_imagenet/train.rec, use 4 threads for decoding.. | |
==64160== NVPROF is profiling process 64160, command: python ../example/image-classification/train_imagenet.py --batch-size=64 --network resnet --num-layers 50 --data-train=/dataset/mxnet_imagenet/train.rec --data-val=/dataset/mxnet_imagenet/val.rec --gpus 0 | |
[09:57:54] src/io/iter_image_recordio_2.cc:172: ImageRecordIOParser2: /dataset/mxnet_imagenet/val.rec, use 4 threads for decoding.. | |
INFO:root:Epoch[0] Batch [0-20] Speed: 171.32 samples/sec accuracy=0.001488 | |
INFO:root:Epoch[0] Batch [20-40] Speed: 174.11 samples/sec accuracy=0.003906 | |
INFO:root:Epoch[0] Batch [40-60] Speed: 174.08 samples/sec accuracy=0.003125 | |
INFO:root:Epoch[0] Batch [60-80] Speed: 160.74 samples/sec accuracy=0.000781 | |
INFO:root:Epoch[0] Batch [80-100] Speed: 173.10 samples/sec accuracy=0.003125 | |
Traceback (most recent call last): | |
File "../example/image-classification/train_imagenet.py", line 67, in <module> | |
fit.fit(args, sym, data.get_rec_iter,args.is_nnp) | |
File "/localdisk/cs/projects/mxnet/example/image-classification/common/fit.py", line 334, in fit | |
monitor=monitor) | |
File "/localdisk/cs/projects/mxnet/python/mxnet/module/base_module.py", line 539, in fit | |
self.update_metric(eval_metric, data_batch.label) | |
File "/localdisk/cs/projects/mxnet/python/mxnet/module/module.py", line 775, in update_metric | |
self._exec_group.update_metric(eval_metric, labels, pre_sliced) | |
File "/localdisk/cs/projects/mxnet/python/mxnet/module/executor_group.py", line 639, in update_metric | |
eval_metric.update_dict(labels_, preds) | |
File "/localdisk/cs/projects/mxnet/python/mxnet/metric.py", line 304, in update_dict | |
metric.update_dict(labels, preds) | |
File "/localdisk/cs/projects/mxnet/python/mxnet/metric.py", line 132, in update_dict | |
self.update(label, pred) | |
File "/localdisk/cs/projects/mxnet/python/mxnet/metric.py", line 418, in update | |
pred_label = pred_label.asnumpy().astype('int32') | |
File "/localdisk/cs/projects/mxnet/python/mxnet/ndarray/ndarray.py", line 1980, in asnumpy | |
ctypes.c_size_t(data.size))) | |
KeyboardInterrupt | |
==64160== Profiling application: python ../example/image-classification/train_imagenet.py --batch-size=64 --network resnet --num-layers 50 --data-train=/dataset/mxnet_imagenet/train.rec --data-val=/dataset/mxnet_imagenet/val.rec --gpus 0 | |
==64160== Profiling result: | |
Time(%) Time Calls Avg Min Max Name | |
14.11% 5.12519s 114680 44.691us 800ns 963.84us [CUDA memcpy DtoD] | |
12.45% 4.52008s 3392 1.3326ms 743.03us 4.1731ms maxwell_scudnn_128x128_stridedB_splitK_interior_nn | |
7.90% 2.86789s 2548 1.1255ms 349.23us 3.8028ms void cudnn::detail::bn_bw_1C11_kernel_new<float, float, float2, int=512, bool=1, int=1>(float, cudnn::detail::bn_bw_1C11_kernel_new<float, float, float2, int=512, bool=1, int=1>, cudnn::detail::bn_bw_1C11_kernel_new<float, float, float2, int=512, bool=1, int=1>, cudnn::detail::bn_bw_1C11_kernel_new<float, float, float2, int=512, bool=1, int=1>, cudnnTensorStruct, float const *, float, float const , float, cudnnTensorStruct*, cudnn::detail::bn_bw_1C11_kernel_new<float, float, float2, int=512, bool=1, int=1> const *, cudnn::detail::bn_bw_1C11_kernel_new<float, float, float2, int=512, bool=1, int=1>*, cudnn::detail::bn_bw_1C11_kernel_new<float, float, float2, int=512, bool=1, int=1> const *, cudnn::detail::bn_bw_1C11_kernel_new<float, float, float2, int=512, bool=1, int=1> const , cudnn::detail::bn_bw_1C11_kernel_new<float, float, float2, int=512, bool=1, int=1> const , cudnn::detail::bn_bw_1C11_kernel_new<float, float, float2, int=512, bool=1, int=1>) | |
7.71% 2.80096s 3379 828.93us 602.10us 1.7109ms maxwell_scudnn_128x128_relu_interior_nn | |
6.33% 2.30001s 2866 802.51us 611.16us 1.5982ms maxwell_scudnn_128x128_stridedB_interior_nn | |
5.73% 2.08083s 13770 151.11us 896ns 1.4894ms cuda_ew_add_float_float_float | |
4.97% 1.80416s 5099 353.83us 44.417us 1.4775ms cuda_ew_relu_backprop_float_float_float | |
4.55% 1.65426s 2550 648.73us 207.05us 1.7187ms void cudnn::detail::bn_fw_tr_1C11_kernel_new<float, float, int=512, bool=1, int=1>(cudnnTensorStruct, float const *, cudnn::detail::bn_fw_tr_1C11_kernel_new<float, float, int=512, bool=1, int=1>, cudnnTensorStruct*, float const *, float const , cudnnTensorStruct*, cudnnTensorStruct*, cudnnTensorStruct**, float const *, float const *, float const *, cudnnTensorStruct*, cudnnTensorStruct*) | |
3.43% 1.24641s 5100 244.39us 29.633us 1.0455ms cuda_ew_relu_float_float | |
3.12% 1.13328s 320 3.5415ms 1.0984ms 5.2278ms void cudnn::detail::dgrad_engine<float, int=128, int=6, int=8, int=3, int=3, int=5, bool=1>(int, int, int, float const *, int, float const , int, cudnn::detail::dgrad_engine<float, int=128, int=6, int=8, int=3, int=3, int=5, bool=1>*, kernel_grad_params, int, int, float, int, int, int) | |
2.03% 737.92ms 310 2.3804ms 789.05us 3.2676ms maxwell_scudnn_128x64_stridedB_splitK_interior_nn | |
1.90% 689.92ms 227 3.0393ms 634.07us 4.4328ms void cudnn::detail::dgrad_engine<float, int=128, int=6, int=7, int=3, int=3, int=5, bool=1>(int, int, int, float const *, int, float const , int, cudnn::detail::dgrad_engine<float, int=128, int=6, int=7, int=3, int=3, int=5, bool=1>*, kernel_grad_params, int, int, float, int, int, int) | |
1.85% 671.11ms 308 2.1789ms 1.9798ms 2.2902ms maxwell_scudnn_128x64_stridedB_splitK_medium_nn | |
1.64% 595.89ms 1231 484.07us 386.09us 619.12us maxwell_sgemmBatched_64x64_raggedMn_nt | |
1.63% 593.23ms 614 966.17us 897.60us 1.1178ms maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | |
1.50% 543.79ms 215 2.5292ms 1.0122ms 3.2193ms void cudnn::detail::dgrad_engine<float, int=512, int=6, int=5, int=3, int=3, int=3, bool=1>(int, int, int, float const *, int, float const , int, cudnn::detail::dgrad_engine<float, int=512, int=6, int=5, int=3, int=3, int=3, bool=1>*, kernel_grad_params, int, int, float, int, int, int) | |
1.39% 504.01ms 614 820.87us 746.84us 884.06us maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile228n_nt | |
1.35% 491.70ms 318 1.5462ms 1.3001ms 1.7274ms maxwell_scudnn_128x128_stridedB_splitK_small_nn | |
1.23% 448.22ms 312 1.4366ms 1.2686ms 1.5698ms maxwell_scudnn_128x128_relu_small_nn | |
1.23% 445.13ms 1836 242.44us 121.38us 466.58us void cudnn::detail::bn_bw_1C11_singleread<float, int=512, bool=1, int=1, int=2, int=14>(float, float, float, float, cudnnTensorStruct, float const *, cudnn::detail::bn_bw_1C11_singleread<float, int=512, bool=1, int=1, int=2, int=14>, float const , cudnn::detail::bn_bw_1C11_singleread<float, int=512, bool=1, int=1, int=2, int=14>, cudnnTensorStruct*, float const *, float*, float const *, float const , float const , float, cudnn::reduced_divisor, int, float*, cudnn::detail::bnBwPersistentState*, int, float, float, float, int, float, cudnnStatus_t*, bool) | |
1.10% 400.55ms 514 779.28us 694.71us 1.6731ms maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile418n_nt | |
1.02% 369.77ms 512 722.20us 298.63us 856.83us maxwell_scudnn_128x64_stridedB_interior_nn | |
1.01% 367.43ms 791 464.51us 992ns 3.5666ms [CUDA memcpy HtoD] | |
0.99% 358.36ms 103 3.4792ms 3.2921ms 3.6933ms maxwell_scudnn_128x64_stridedB_splitK_large_nn | |
0.98% 354.21ms 718 493.34us 433.58us 714.46us maxwell_sgemmBatched_64x64_raggedMn_nn | |
0.81% 295.37ms 1836 160.88us 82.115us 303.98us void cudnn::detail::bn_fw_tr_1C11_singleread<float, int=512, bool=1, int=1, int=2, int=20>(cudnnTensorStruct, float const *, cudnn::detail::bn_fw_tr_1C11_singleread<float, int=512, bool=1, int=1, int=2, int=20>, cudnnTensorStruct*, float const *, float const , float, float, float*, float const *, float const *, float const *, float, float, cudnn::reduced_divisor, int, float*, cudnn::detail::bnFwPersistentState*, int, float, float, float, int, float, float, cudnnStatus_t*, bool) | |
0.57% 206.27ms 10500 19.644us 928ns 465.90us void scalePackedTensor_kernel<float, float>(cudnnTensor4dStruct, float*, float) | |
0.52% 190.60ms 308 618.84us 288.55us 821.69us maxwell_scudnn_128x64_relu_interior_nn | |
0.51% 184.71ms 103 1.7933ms 1.6518ms 2.1236ms maxwell_scudnn_128x64_relu_medium_nn | |
0.47% 170.67ms 101 1.6898ms 1.6513ms 1.7405ms void cudnn::detail::pooling_bw_kernel_max<float, float, cudnn::detail::maxpooling_func<float, cudnnNanPropagation_t=0>, bool=0>(cudnnTensorStruct, float const *, cudnn::detail::pooling_bw_kernel_max<float, float, cudnn::detail::maxpooling_func<float, cudnnNanPropagation_t=0>, bool=0>, float const , cudnn::detail::pooling_bw_kernel_max<float, float, cudnn::detail::maxpooling_func<float, cudnnNanPropagation_t=0>, bool=0>, float const , cudnn::detail::pooling_bw_kernel_max<float, float, cudnn::detail::maxpooling_func<float, cudnnNanPropagation_t=0>, bool=0>, cudnnTensorStruct*, cudnnPoolingStruct, float, cudnnPoolingStruct, int, cudnn::reduced_divisor, float) | |
0.44% 159.79ms 1024 156.04us 70.946us 487.66us void cudnn::winograd_nonfused::winogradWgradDelta4x4<float, float>(cudnn::winograd_nonfused::WinogradDeltaParams<float, float>) | |
0.41% 149.55ms 1024 146.05us 68.866us 436.33us void cudnn::winograd_nonfused::winogradWgradData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | |
0.40% 145.62ms 15859 9.1820us 1.3440us 113.96us _ZN5mxnet2op8mxnet_op20mxnet_generic_kernelINS0_12SGDMomKernelEJPfS4_S4_S4_fffffNS_9OpReqTypeEEEEviDpT0_ | |
0.30% 108.51ms 102 1.0638ms 1.0317ms 1.0988ms void cudnn::detail::pooling_bw_kernel_avg<float, float, cudnn::detail::averpooling_func<float>, int=1, bool=0>(cudnnTensorStruct, float const *, float const , cudnn::detail::pooling_bw_kernel_avg<float, float, cudnn::detail::averpooling_func<float>, int=1, bool=0>, float const , cudnn::detail::pooling_bw_kernel_avg<float, float, cudnn::detail::averpooling_func<float>, int=1, bool=0>, cudnnTensorStruct*, cudnnPoolingStruct, float, cudnnPoolingStruct, int, cudnn::reduced_divisor, float) | |
0.29% 106.13ms 816 130.06us 64.066us 245.64us void cudnn::detail::bn_bw_1C11_singleread<float, int=512, bool=1, int=1, int=2, int=0>(float, float, float, float, cudnnTensorStruct, float const *, cudnn::detail::bn_bw_1C11_singleread<float, int=512, bool=1, int=1, int=2, int=0>, float const , cudnn::detail::bn_bw_1C11_singleread<float, int=512, bool=1, int=1, int=2, int=0>, cudnnTensorStruct*, float const *, float*, float const *, float const , float const , float, cudnn::reduced_divisor, int, float*, cudnn::detail::bnBwPersistentState*, int, float, float, float, int, float, cudnnStatus_t*, bool) | |
0.28% 103.32ms 926 111.57us 78.274us 487.28us void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | |
0.26% 95.456ms 203 470.23us 448.62us 708.76us void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, int=8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, int=2, float>, float>, mshadow::expr::Plan<mshadow::expr::ReduceWithAxisExp<mshadow::red::maximum, mshadow::Tensor<mshadow::gpu, int=3, float>, float, int=3, bool=1, int=2>, float>>(mshadow::gpu, long, mshadow::Shape<int=2>, int=2) | |
0.25% 91.035ms 926 98.310us 64.770us 424.30us void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | |
0.24% 87.239ms 30 2.9080ms 1.0659ms 6.1650ms void cudnn::detail::wgrad_alg0_engine<float, int=128, int=6, int=8, int=3, int=3, int=5, bool=1, int=512>(int, int, int, float const *, int, cudnn::detail::wgrad_alg0_engine<float, int=128, int=6, int=8, int=3, int=3, int=5, bool=1, int=512>*, float const , kernel_grad_params, int, float, int, int, int, int) | |
0.24% 85.922ms 102 842.38us 837.44us 849.53us void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, cudnnNanPropagation_t=0>, int=0, bool=0>(cudnnTensorStruct, float const *, cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, cudnnNanPropagation_t=0>, int=0, bool=0>, cudnnTensorStruct*, cudnnPoolingStruct, float, cudnnPoolingStruct, int, cudnn::reduced_divisor, float) | |
0.21% 77.026ms 816 94.394us 46.689us 176.61us void cudnn::detail::bn_fw_tr_1C11_singleread<float, int=512, bool=1, int=1, int=2, int=0>(cudnnTensorStruct, float const *, cudnn::detail::bn_fw_tr_1C11_singleread<float, int=512, bool=1, int=1, int=2, int=0>, cudnnTensorStruct*, float const *, float const , float, float, float*, float const *, float const *, float const *, float, float, cudnn::reduced_divisor, int, float*, cudnn::detail::bnFwPersistentState*, int, float, float, float, int, float, float, cudnnStatus_t*, bool) | |
0.20% 72.070ms 1024 70.381us 7.7120us 176.74us void cudnn::winograd_nonfused::winogradWgradOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradWgradOutputParams<float, float>) | |
0.19% 68.834ms 30 2.2945ms 405.65us 5.3507ms cudnn_maxwell_gcgemm_64x64_nt_batched | |
0.18% 63.999ms 26 2.4615ms 431.69us 5.9051ms cudnn_maxwell_gcgemm_64x64_tn_batched | |
0.18% 63.816ms 926 68.915us 5.8560us 128.39us void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | |
0.12% 44.394ms 22 2.0179ms 449.20us 7.1602ms cudnn_maxwell_cgemm_64x64_tn_batched | |
0.11% 38.407ms 45 853.48us 324.39us 2.6628ms void transpose_readWrite_alignment_kernel<float2, float2, int=1, bool=0, int=6, int=4, int=4>(cublasTransposeParams<float2>, float2 const *, float2*, float2 const *) | |
0.10% 35.924ms 102 352.20us 342.54us 363.66us cuda_softmax_float_float_ri_2_rr_1 | |
0.10% 35.905ms 24 1.4961ms 962.66us 2.1208ms void cudnn::detail::implicit_convolve_sgemm<float, float, int=128, int=6, int=7, int=3, int=3, int=5, int=1, bool=1, bool=0, bool=1>(int, int, int, float const *, int, float*, cudnn::detail::implicit_convolve_sgemm<float, float, int=128, int=6, int=7, int=3, int=3, int=5, int=1, bool=1, bool=0, bool=1>*, kernel_conv_params, int, float, float, int, float, float, int, int) | |
0.10% 34.716ms 20808 1.6680us 896ns 14.176us cuda_ew_mul_float_float_float | |
0.07% 27.011ms 12 2.2509ms 2.0932ms 2.5630ms void cudnn::detail::implicit_convolve_sgemm<float, float, int=512, int=6, int=8, int=3, int=3, int=5, int=1, bool=1, bool=0, bool=1>(int, int, int, float const *, int, float*, cudnn::detail::implicit_convolve_sgemm<float, float, int=512, int=6, int=8, int=3, int=3, int=5, int=1, bool=1, bool=0, bool=1>*, kernel_conv_params, int, float, float, int, float, float, int, int) | |
0.07% 26.579ms 8 3.3224ms 1.6161ms 5.5576ms void cudnn::detail::wgrad_alg0_engine<float, int=512, int=6, int=5, int=3, int=3, int=3, bool=1, int=512>(int, int, int, float const *, int, cudnn::detail::wgrad_alg0_engine<float, int=512, int=6, int=5, int=3, int=3, int=3, bool=1, int=512>*, float const , kernel_grad_params, int, float, int, int, int, int) | |
0.07% 26.296ms 42 626.11us 94.083us 3.1846ms void fft2d_r2c_16x16<float>(float2*, float const *, int, int, int, int, int, int, int, int) | |
0.06% 21.539ms 21109 1.0200us 832ns 11.521us cuda_broadcast_float_float_r1 | |
0.06% 20.332ms 44 462.09us 18.273us 970.05us void fft1d_r2c_32<float, float, float2, bool=0, bool=0>(float2*, float const *, int, int3, int3, int2, int2) | |
0.05% 19.839ms 21804 909ns 345ns 70.886us [CUDA memset] | |
0.05% 19.450ms 1742 11.165us 6.4330us 74.050us void cudnn::winograd::generateWinogradTilesKernel<int=0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | |
0.05% 19.431ms 6 3.2384ms 1.5888ms 4.2962ms void cudnn::detail::implicit_convolve_sgemm<float, float, int=128, int=5, int=5, int=3, int=3, int=3, int=1, bool=1, bool=0, bool=1>(int, int, int, float const *, int, float*, cudnn::detail::implicit_convolve_sgemm<float, float, int=128, int=5, int=5, int=3, int=3, int=3, int=1, bool=1, bool=0, bool=1>*, kernel_conv_params, int, float, float, int, float, float, int, int) | |
0.05% 18.922ms 8 2.3653ms 1.4945ms 3.8455ms void cudnn::detail::wgrad_alg0_engine<float, int=128, int=6, int=7, int=3, int=3, int=5, bool=1, int=512>(int, int, int, float const *, int, cudnn::detail::wgrad_alg0_engine<float, int=128, int=6, int=7, int=3, int=3, int=5, bool=1, int=512>*, float const , kernel_grad_params, int, float, int, int, int, int) | |
0.05% 18.690ms 32 584.07us 8.5770us 3.6861ms void flip_filter<float, float>(float*, float const *, int, int, int, int) | |
0.05% 18.576ms 30 619.20us 190.89us 1.3424ms void fft2d_r2c_64x64<float>(float2*, float const *, int, int, int, int, int, int, int, int) | |
0.04% 16.224ms 7813 2.0760us 1.6000us 14.945us cudnn::maxwell::gemm::computeBOffsetsKernel(cudnn::maxwell::gemm::ComputeBOffsetsParams) | |
0.04% 15.682ms 21 746.77us 98.564us 3.7564ms void fft2d_c2r_16x16<float, bool=0>(float*, float2*, int, int, int, int, int, int, int, int, int, int, float, float, int, float*, float*) | |
0.04% 14.836ms 7485 1.9820us 1.6320us 13.569us cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | |
0.04% 14.682ms 10 1.4682ms 849.72us 1.8616ms void cudnn::detail::explicit_convolve_sgemm<float, int, int=512, int=6, int=8, int=3, int=3, int=5, int=0, bool=1>(int, int, int, float const *, int, float const , int, cudnn::detail::explicit_convolve_sgemm<float, int, int=512, int=6, int=8, int=3, int=3, int=5, int=0, bool=1>*, kernel_conv_params, int, int, float, float, int, float const *, float const *) | |
0.04% 13.994ms 9 1.5549ms 938.65us 2.0763ms void cudnn::detail::explicit_convolve_sgemm<float, int, int=128, int=6, int=7, int=3, int=3, int=5, int=0, bool=1>(int, int, int, float const *, int, float const , int, cudnn::detail::explicit_convolve_sgemm<float, int, int=128, int=6, int=7, int=3, int=3, int=5, int=0, bool=1>*, kernel_conv_params, int, int, float, float, int, float const *, float const *) | |
0.04% 13.514ms 102 132.49us 130.66us 134.40us void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, int=1, bool=0>(cudnnTensorStruct, float const *, cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, int=1, bool=0>, cudnnTensorStruct*, cudnnPoolingStruct, float, cudnnPoolingStruct, int, cudnn::reduced_divisor, float) | |
0.04% 13.124ms 2 6.5621ms 3.5665ms 9.5578ms void cudnn::detail::dgrad2d_alg1_1<float, int=0, int=6, int=7, int=5, int=4, int=5, bool=1, bool=1>(int, int, int, float const *, int, float const , int, cudnn::detail::dgrad2d_alg1_1<float, int=0, int=6, int=7, int=5, int=4, int=5, bool=1, bool=1>*, kernel_grad_params, int, int, float, int, int) | |
0.03% 12.303ms 32 384.46us 108.23us 793.66us void fft1d_c2r_32<float2, float, float, bool=0, bool=1, bool=0, bool=0>(float*, float2 const *, int, int3, int3, int2, int, float, float, float*, float*) | |
0.03% 12.276ms 23 533.72us 122.28us 1.4918ms void im2col4d_kernel<float, int>(im2col4d_params, cudnnConvolutionStruct, cudnnTensor4dStruct, float const *, float*, int) | |
0.03% 12.183ms 16 761.44us 64.770us 3.2093ms void fft2d_r2c_32x32<float, unsigned int=1, bool=0>(float2*, float const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool) | |
0.03% 11.152ms 10506 1.0610us 832ns 11.200us cuda_ew_subtractf_float_float_float | |
0.03% 10.297ms 5200 1.9800us 1.6640us 12.289us cuda_cudnn_bn_inv_var_float_float | |
0.03% 9.2667ms 4431 2.0910us 1.7280us 14.976us cudnn::maxwell::gemm::computeWgradOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | |
0.02% 8.8095ms 15 587.30us 184.90us 1.2894ms void fft2d_c2r_64x64<float, bool=0>(float*, float2*, int, int, int, int, int, int, int, int, int, int, float, float, int, float*, float*) | |
0.02% 7.9116ms 26 304.29us 107.17us 752.92us void fft2d_r2c_32x32<float, unsigned int=0, bool=0>(float2*, float const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool) | |
0.02% 6.3417ms 102 62.173us 60.418us 64.578us maxwell_sgemm_128x64_raggedMn_nn_splitK | |
0.02% 5.7797ms 16 361.23us 146.92us 980.77us cudnn_maxwell_cgemm_64x64_nt_batched | |
0.01% 5.3689ms 1 5.3689ms 5.3689ms 5.3689ms void cudnn::detail::dgrad2d_alg1_1<float, int=0, int=4, int=6, int=3, int=2, int=4, bool=1, bool=1>(int, int, int, float const *, int, float const , int, cudnn::detail::dgrad2d_alg1_1<float, int=0, int=4, int=6, int=3, int=2, int=4, bool=1, bool=1>*, kernel_grad_params, int, int, float, int, int) | |
0.01% 5.3033ms 4 1.3258ms 545.01us 2.1067ms void cudnn::detail::implicit_convolve_sgemm<float, float, int=1024, int=5, int=5, int=3, int=3, int=3, int=1, bool=1, bool=0, bool=1>(int, int, int, float const *, int, float*, cudnn::detail::implicit_convolve_sgemm<float, float, int=1024, int=5, int=5, int=3, int=3, int=3, int=1, bool=1, bool=0, bool=1>*, kernel_conv_params, int, float, float, int, float, float, int, int) | |
0.01% 4.8451ms 3 1.6150ms 523.28us 2.9760ms void cudnn::detail::explicit_convolve_sgemm<float, int, int=1024, int=5, int=5, int=3, int=3, int=3, int=0, bool=1>(int, int, int, float const *, int, float const , int, cudnn::detail::explicit_convolve_sgemm<float, int, int=1024, int=5, int=5, int=3, int=3, int=3, int=0, bool=1>*, kernel_conv_params, int, int, float, float, int, float const *, float const *) | |
0.01% 4.5626ms 4 1.1407ms 74.563us 3.4402ms void fft2d_r2c_32x32<float, unsigned int=1, bool=1>(float2*, float const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool) | |
0.01% 4.3777ms 1 4.3777ms 4.3777ms 4.3777ms void cudnn::detail::dgrad2d_alg1_1<float, int=0, int=6, int=6, int=5, int=4, int=4, bool=1, bool=1>(int, int, int, float const *, int, float const , int, cudnn::detail::dgrad2d_alg1_1<float, int=0, int=6, int=6, int=5, int=4, int=4, bool=1, bool=1>*, kernel_grad_params, int, int, float, int, int) | |
0.01% 4.2742ms 8 534.27us 199.56us 1.3761ms void fft2d_c2r_32x32<float, bool=0, unsigned int=1, bool=0, bool=0>(float*, float2 const *, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*) | |
0.01% 4.2706ms 204 20.934us 2.2080us 40.385us cuda_reshape_float_float | |
0.01% 4.2381ms 102 41.549us 40.609us 42.849us sgemm_32x32x32_NN_vec | |
0.01% 4.1431ms 18 230.17us 119.56us 594.77us void fft2d_c2r_32x32<float, bool=0, unsigned int=0, bool=0, bool=0>(float*, float2 const *, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*) | |
0.01% 4.0963ms 3 1.3654ms 1.2589ms 1.4230ms maxwell_scudnn_128x128_stridedB_small_nn | |
0.01% 4.0324ms 102 39.533us 38.273us 41.153us sgemm_128x128x8_NN_vec | |
0.01% 3.4697ms 1 3.4697ms 3.4697ms 3.4697ms void cudnn::detail::explicit_convolve_sgemm<float, int, int=128, int=5, int=5, int=3, int=3, int=3, int=0, bool=1>(int, int, int, float const *, int, float const , int, cudnn::detail::explicit_convolve_sgemm<float, int, int=128, int=5, int=5, int=3, int=3, int=3, int=0, bool=1>*, kernel_conv_params, int, int, float, float, int, float const *, float const *) | |
0.01% 2.7427ms 12 228.56us 22.305us 760.60us void fft1d_r2c_32<float, float, float2, bool=1, bool=0>(float2*, float const *, int, int3, int3, int2, int2) | |
0.00% 1.7888ms 1 1.7888ms 1.7888ms 1.7888ms maxwell_sgemmBatched_128x128_raggedMn_nt | |
0.00% 1.7836ms 1 1.7836ms 1.7836ms 1.7836ms maxwell_scudnn_128x64_relu_small_nn | |
0.00% 1.4451ms 1 1.4451ms 1.4451ms 1.4451ms maxwell_scudnn_128x64_stridedB_small_nn | |
0.00% 1.1413ms 418 2.7300us 800ns 124.61us void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, int=8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, int=2, float>, float>, mshadow::expr::Plan<mshadow::expr::ScalarExp<float>, float>>(mshadow::gpu, long, mshadow::Shape<int=2>, int=2) | |
0.00% 514.74us 102 5.0460us 4.8640us 5.2810us cuda_reduce_nd_float_float_ri_2_rr_1 | |
0.00% 365.68us 203 1.8010us 1.3120us 3.6160us [CUDA memcpy DtoH] | |
0.00% 329.81us 157 2.1000us 768ns 20.577us _ZN5mxnet2op8mxnet_op20mxnet_generic_kernelINS1_11op_with_reqINS1_10set_to_intILi0EEELi1EEEJPfEEEviDpT0_ | |
0.00% 294.57us 102 2.8870us 2.8160us 3.0400us cuda_broadcast_float_float_r2 | |
0.00% 165.22us 102 1.6190us 1.5680us 1.8570us cuda_onehot_float_float | |
==64160== API calls: | |
Time(%) Time Calls Avg Min Max Name | |
77.58% 58.4737s 34290 1.7053ms 1.9210us 628.90ms cudaStreamSynchronize | |
4.62% 3.48341s 234 14.886ms 6.6080us 2.39895s cudaHostAlloc | |
4.13% 3.10948s 290 10.722ms 822ns 970.41ms cudaFree | |
2.31% 1.74128s 82524 21.100us 5.3400us 7.1451ms cudaLaunch | |
2.24% 1.68961s 24 70.400ms 27.952us 1.68667s cudaStreamCreateWithFlags | |
2.00% 1.50539s 98835 15.231us 7.7970us 5.8659ms cudaMemcpy | |
1.59% 1.19708s 16839 71.089us 5.7390us 24.144ms cudaMemcpy2DAsync | |
1.40% 1.05378s 1475 714.43us 5.7990us 12.682ms cudaMalloc | |
1.30% 980.29ms 353 2.7770ms 465.68us 12.658ms cudaEventSynchronize | |
0.90% 677.32ms 82204 8.2390us 4.0330us 23.363ms cuLaunchKernel | |
0.87% 657.45ms 1147 573.19us 176.19us 29.057ms cudaMemGetInfo | |
0.23% 176.09ms 20805 8.4630us 3.6110us 754.52us cudaMemset | |
0.23% 171.78ms 547286 313ns 129ns 1.8970ms cudaSetupArgument | |
0.17% 131.23ms 1 131.23ms 131.23ms 131.23ms cuModuleUnload | |
0.14% 102.25ms 157 651.26us 12.750us 9.4380ms cudaFreeHost | |
0.07% 55.783ms 12 4.6486ms 3.6473ms 7.6539ms cuModuleLoadDataEx | |
0.06% 42.201ms 82524 511ns 155ns 1.5581ms cudaConfigureCall | |
0.04% 28.708ms 79581 360ns 128ns 873.28us cudaGetLastError | |
0.02% 18.024ms 4 4.5061ms 2.5248ms 6.4497ms cudaGetDeviceProperties | |
0.01% 10.312ms 439 23.490us 143ns 3.8474ms cuDeviceGetAttribute | |
0.01% 10.079ms 16637 605ns 195ns 1.9837ms cudaPeekAtLastError | |
0.01% 8.2062ms 999 8.2140us 2.4140us 131.46us cudaMemsetAsync | |
0.01% 8.1472ms 1670 4.8780us 1.4250us 66.866us cudaBindTexture | |
0.01% 6.1833ms 1289 4.7960us 603ns 88.255us cudaEventRecord | |
0.01% 6.0867ms 4085 1.4900us 337ns 33.656us cudaSetDevice | |
0.01% 5.1365ms 3597 1.4280us 349ns 65.811us cudaGetDevice | |
0.01% 5.0087ms 5 1.0017ms 257.94us 3.5336ms cuDeviceTotalMem | |
0.00% 2.8492ms 5 569.84us 124.03us 2.2879ms cuDeviceGetName | |
0.00% 2.5166ms 1670 1.5060us 507ns 70.883us cudaUnbindTexture | |
0.00% 2.4189ms 353 6.8520us 2.9270us 24.985us cudaEventElapsedTime | |
0.00% 2.2848ms 12 190.40us 27.940us 1.5934ms cudaStreamCreateWithPriority | |
0.00% 1.7816ms 717 2.4840us 556ns 519.25us cudaStreamWaitEvent | |
0.00% 1.4907ms 302 4.9360us 480ns 204.85us cudaEventCreateWithFlags | |
0.00% 1.1473ms 4 286.82us 27.690us 904.01us cudaStreamCreate | |
0.00% 641.12us 612 1.0470us 419ns 7.5480us cuCtxSetCurrent | |
0.00% 499.45us 170 2.9370us 617ns 60.886us cudaEventDestroy | |
0.00% 474.48us 102 4.6510us 3.4420us 14.501us cudaEventQuery | |
0.00% 288.07us 214 1.3460us 291ns 26.789us cudaDeviceGetAttribute | |
0.00% 125.52us 32 3.9220us 1.7410us 23.488us cudaEventCreate | |
0.00% 75.843us 12 6.3200us 970ns 58.410us cuModuleGetFunction | |
0.00% 47.110us 3 15.703us 6.0240us 32.539us cudaHostGetDevicePointer | |
0.00% 17.946us 6 2.9910us 297ns 5.3140us cudaGetDeviceCount | |
0.00% 11.204us 3 3.7340us 2.3930us 6.3840us cudaDeviceGetStreamPriorityRange | |
0.00% 6.6900us 5 1.3380us 290ns 3.3380us cuInit | |
0.00% 4.7980us 7 685ns 221ns 2.5700us cuDeviceGetCount | |
0.00% 3.5380us 8 442ns 221ns 872ns cuDeviceGet | |
0.00% 2.3270us 4 581ns 307ns 892ns cuDriverGetVersion | |
0.00% 1.5990us 1 1.5990us 1.5990us 1.5990us cuDevicePrimaryCtxRelease | |
0.00% 716ns 1 716ns 716ns 716ns cuDevicePrimaryCtxRetain | |
======== Error: Application returned non-zero code 1 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment