Created
September 10, 2019 16:12
-
-
Save heiner/25a4bd8e38fb33fbefc4c59e069ea475 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
==50689== NVPROF is profiling process 50689, command: python simpleembed.py | |
==50689== Profiling application: python simpleembed.py | |
==50689== Profiling result: | |
Type Time(%) Time Calls Avg Min Max Name | |
GPU activities: 57.66% 248.18ms 30 8.2726ms 5.6467ms 13.019ms maxwell_scudnn_128x32_stridedB_splitK_interior_nn | |
16.02% 68.946ms 30 2.2982ms 1.0702ms 4.6368ms void calc_bias_diff<int=2, float, float, int=128, int=0>(cudnnTensorStruct, float const *, cudnnTensorStruct, float*, float, float, int) | |
4.22% 18.172ms 30 605.73us 2.6240us 945.47us _ZN2at6native18elementwise_kernelILi128ELi4EZNS0_15gpu_kernel_implIZNS0_16copy_kernel_implIffEEvRNS_14TensorIteratorEEUlfE_EEvS5_RKT_EUliE0_EEviT1_ | |
3.22% 13.858ms 30 461.94us 100.42us 1.1248ms maxwell_scudnn_128x32_stridedB_small_nn | |
3.09% 13.316ms 30 443.88us 113.18us 1.0100ms maxwell_scudnn_128x32_relu_interior_nn | |
2.45% 10.565ms 100 105.65us 100.67us 113.41us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__merge_sort::MergeAgent<thrust::device_ptr<long>, thrust::device_ptr<long>, long, ThrustLTOp<long, bool=0>, thrust::detail::integral_constant<bool, bool=1>>, bool, thrust::device_ptr<long>, thrust::device_ptr<long>, long, long*, long*, ThrustLTOp<long, bool=0>, long*, long>(thrust::device_ptr<long>, thrust::device_ptr<long>, long, long, bool=0, ThrustLTOp<long, bool=0>, bool, bool=1, thrust::detail::integral_constant<bool, bool=1>) | |
2.45% 10.524ms 60 175.40us 15.904us 546.14us _ZN2at6native18elementwise_kernelILi512ELi1EZNS0_15gpu_kernel_implIZNS0_21threshold_kernel_implIfEEvRNS_14TensorIteratorET_S6_EUlffE_EEvS5_RKS6_EUliE_EEviT1_ | |
2.24% 9.6553ms 10 965.53us 965.05us 966.24us void at::native::_GLOBAL__N__70_tmpxft_00007ce6_00000000_11_EmbeddingBackwardKernel_compute_75_cpp1_ii_4a9f87d3::compute_grad_weight<float>(long*, float*, at::native::_GLOBAL__N__70_tmpxft_00007ce6_00000000_11_EmbeddingBackwardKernel_compute_75_cpp1_ii_4a9f87d3::compute_grad_weight<float>, long, long, at::native::_GLOBAL__N__70_tmpxft_00007ce6_00000000_11_EmbeddingBackwardKernel_compute_75_cpp1_ii_4a9f87d3::compute_grad_weight<float>, long, at::AccumulateType<long*, bool=1>::type*, int, long) | |
2.10% 9.0427ms 10 904.27us 903.74us 905.44us void at::native::_GLOBAL__N__63_tmpxft_00007a48_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_db999de0::MaxPoolBackward<float, float>(int, float const *, long const *, int, int, int, int, int, int, int, int, int, int, int, int, int, int, at::native::_GLOBAL__N__63_tmpxft_00007a48_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_db999de0::MaxPoolBackward<float, float>*) | |
1.75% 7.5375ms 10 753.75us 752.32us 755.64us void indexSelectLargeIndex<float, unsigned int, int=2, int=2, int=-2, bool=1>(TensorInfo<float, unsigned int>, TensorInfo<float, unsigned int>, TensorInfo<long, unsigned int>, int, int, unsigned int, unsigned int, long) | |
1.62% 6.9521ms 30 231.74us 30.080us 598.91us void op_generic_tensor_kernel<int=2, float, float, float, int=256, cudnnGenericOp_t=0, cudnnNanPropagation_t=0, cudnnDimOrder_t=0, int=0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const *, cudnnTensorStruct, float const *, float, float, float, float, dimArray, reducedDivisorArray, bool) | |
0.73% 3.1329ms 10 313.29us 312.32us 314.11us void at::native::_GLOBAL__N__63_tmpxft_00007a48_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_db999de0::MaxPoolForward<float, float>(int, float const *, int, int, int, int, int, int, int, int, int, int, int, int, int, int, at::native::_GLOBAL__N__63_tmpxft_00007a48_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_db999de0::MaxPoolForward<float, float>*, long*) | |
0.70% 3.0293ms 10 302.93us 300.70us 308.26us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__merge_sort::BlockSortAgent<thrust::device_ptr<long>, thrust::device_ptr<long>, long, ThrustLTOp<long, bool=0>, thrust::detail::integral_constant<bool, bool=1>, thrust::detail::integral_constant<bool, bool=0>>, bool, thrust::device_ptr<long>, thrust::device_ptr<long>, long, long*, long*, ThrustLTOp<long, bool=0>>(thrust::device_ptr<long>, thrust::device_ptr<long>, long, long, bool=0, ThrustLTOp<long, bool=0>, bool) | |
0.40% 1.7338ms 100 17.338us 9.1200us 23.104us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__merge_sort::PartitionAgent<thrust::device_ptr<long>, long, ThrustLTOp<long, bool=0>>, bool, thrust::device_ptr<long>, long*, long, unsigned long, long*, ThrustLTOp<long, bool=0>, long, int>(thrust::device_ptr<long>, long, long, bool=0, ThrustLTOp<long, bool=0>, thrust::cuda_cub::__merge_sort::PartitionAgent<thrust::device_ptr<long>, long, ThrustLTOp<long, bool=0>>, bool, thrust::device_ptr<long>, long*) | |
0.25% 1.0856ms 10 108.56us 108.03us 109.38us _ZN84_GLOBAL__N__60_tmpxft_00007aab_00000000_11_Distributions_compute_75_cpp1_ii_c3aa7ee643distribution_elementwise_grid_stride_kernelImLi2EZZZN2at6native18random_kernel_cudaERNS1_14TensorIteratorEmlPNS1_9GeneratorEENKUlvE_clEvENKUlvE4_clEvEUlP24curandStatePhilox4_32_10E_ZNS_27distribution_nullary_kernelIlmLi2ESB_ZZZNS2_18random_kernel_cudaES4_mlS6_ENKS7_clEvENKS8_clEvEUlmE_EEvS4_PNS1_13CUDAGeneratorERKT2_T3_EUlimE_EEviSt4pairImmET1_SG_ | |
0.16% 695.42us 20 34.770us 17.376us 51.776us [CUDA memcpy DtoD] | |
0.13% 546.30us 10 54.629us 52.927us 55.520us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__unique_by_key::UniqueByKeyAgent<thrust::device_ptr<long>, thrust::counting_iterator<int, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::device_ptr<long>, thrust::device_ptr<long>, thrust::equal_to<long>, int, int*>, thrust::device_ptr<long>, thrust::counting_iterator<int, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::device_ptr<long>, thrust::device_ptr<long>, thrust::equal_to<long>, int*, int, thrust::cuda_cub::cub::ScanTileState<int, bool=1>, unsigned long>(thrust::device_ptr<long>, int, thrust::use_default, thrust::use_default, thrust::use_default, thrust::counting_iterator<int, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::device_ptr<long>, thrust::device_ptr<long>, long) | |
0.07% 312.13us 10 31.212us 30.880us 31.904us void at::native::_GLOBAL__N__70_tmpxft_00007ce6_00000000_11_EmbeddingBackwardKernel_compute_75_cpp1_ii_4a9f87d3::sum_and_scatter<float>(long*, float*, long, at::native::_GLOBAL__N__70_tmpxft_00007ce6_00000000_11_EmbeddingBackwardKernel_compute_75_cpp1_ii_4a9f87d3::sum_and_scatter<float>, long, at::AccumulateType<long*, bool=1>::type const *, long const *, long, long) | |
0.07% 290.94us 10 29.094us 28.224us 29.535us sgemm_32x32x32_NT_vec | |
0.07% 286.56us 10 28.655us 28.096us 29.248us sgemm_32x32x32_TN_vec | |
0.05% 235.74us 10 23.574us 23.168us 24.000us maxwell_sgemm_128x64_nn | |
0.05% 229.12us 10 22.911us 22.624us 23.232us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__transform::unary_transform_f<thrust::counting_iterator<long, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::device_ptr<long>, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<long>, thrust::cuda_cub::__transform::always_true_predicate>, long>, thrust::cuda_cub::__transform::unary_transform_f<thrust::counting_iterator<long, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::device_ptr<long>, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<long>, thrust::cuda_cub::__transform::always_true_predicate>, long>(thrust::use_default, thrust::use_default) | |
0.04% 183.04us 90 2.0330us 1.5040us 4.6720us _ZN2at6native18elementwise_kernelILi512ELi1EZNS0_15gpu_kernel_implIZZZNS0_19addcdiv_cuda_kernelERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlfffE_EEvS4_RKT_EUliE_EEviT1_ | |
0.04% 166.43us 158 1.0530us 768ns 3.7120us [CUDA memset] | |
0.04% 156.42us 81 1.9310us 1.2160us 3.5200us _ZN2at6native18elementwise_kernelILi512ELi1EZNS0_15gpu_kernel_implIZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_EEvS4_RKT_EUliE_EEviT1_ | |
0.03% 138.37us 90 1.5370us 1.1840us 15.008us void kernelPointwiseApply2<Tensor_sqrt_Float_Op, float, float, unsigned int, int=1, int=1>(OffsetInfo<float, unsigned int, int=1>, OffsetInfo<float, unsigned int, int=1>, unsigned int, Tensor_sqrt_Float_Op) | |
0.03% 136.13us 60 2.2680us 1.3760us 3.9680us cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | |
0.03% 131.33us 90 1.4590us 1.2160us 2.6560us _ZN2at6native18elementwise_kernelILi512ELi1EZNS0_15gpu_kernel_implIZZZNS0_19addcmul_cuda_kernelERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlfffE_EEvS4_RKT_EUliE_EEviT1_ | |
0.03% 129.47us 90 1.4380us 1.0880us 2.8800us _ZN2at6native18elementwise_kernelILi512ELi1EZNS0_15gpu_kernel_implIZNS0_23gpu_kernel_with_scalarsIZZZNS0_15mul_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvEUlffE_EEvS5_RKT_EUlfE0_EEvS5_SB_EUliE_EEviT1_ | |
0.03% 112.54us 90 1.2500us 1.0560us 2.4960us _ZN2at6native18elementwise_kernelILi512ELi1EZNS0_15gpu_kernel_implIZNS0_23gpu_kernel_with_scalarsIZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_EEvS5_RKT_EUlfE0_EEvS5_SD_EUliE_EEviT1_ | |
0.03% 108.00us 10 10.799us 10.624us 10.976us at::native::_GLOBAL__N__70_tmpxft_00007ce6_00000000_11_EmbeddingBackwardKernel_compute_75_cpp1_ii_4a9f87d3::krn_partial_segment_offset(long*, long const *, long const *, long const *, long) | |
0.02% 107.55us 10 10.755us 10.496us 11.392us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__scan::ScanAgent<thrust::device_ptr<long>, thrust::device_ptr<long>, thrust::plus<long>, int, long, thrust::detail::integral_constant<bool, bool=0>>, thrust::device_ptr<long>, thrust::device_ptr<long>, thrust::plus<long>, int, thrust::cuda_cub::cub::ScanTileState<long, bool=1>, thrust::cuda_cub::__scan::AddInitToExclusiveScan<long, thrust::plus<long>>>(thrust::device_ptr<long>, thrust::device_ptr<long>, long, thrust::plus<long>, int, long) | |
0.02% 88.159us 10 8.8150us 8.6400us 9.0240us _ZN2at6native13reduce_kernelILi512ENS0_8ReduceOpIfNS0_14func_wrapper_tIfZNS0_15sum_kernel_implIfffEEvRNS_14TensorIteratorEEUlffE_EEjfLi4EEEEEvT0_ | |
0.02% 86.304us 10 8.6300us 8.1920us 9.0240us void at::native::reduce_kernel<int=512, at::native::ReduceOp<float, at::native::MeanOps<float, float>, unsigned int, float, int=4>>(float) | |
0.02% 83.262us 60 1.3870us 1.1190us 2.6560us cudnn::maxwell::gemm::computeBOffsetsKernel(cudnn::maxwell::gemm::ComputeBOffsetsParams) | |
0.01% 62.240us 30 2.0740us 1.3760us 3.0720us cudnn::maxwell::gemm::computeWgradOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | |
0.01% 52.639us 12 4.3860us 864ns 35.520us [CUDA memcpy HtoD] | |
0.01% 50.592us 30 1.6860us 1.4080us 1.9200us void scalePackedTensor_kernel<float, float>(cudnnTensor4dStruct, float*, float) | |
0.01% 41.088us 20 2.0540us 1.2480us 2.9440us _ZN2at6native18elementwise_kernelILi128ELi4EZNS0_15gpu_kernel_implIZNS0_23gpu_kernel_with_scalarsIZZZNS0_15mul_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvEUlffE_EEvS5_RKT_EUlfE0_EEvS5_SB_EUliE0_EEviT1_ | |
0.01% 33.984us 30 1.1320us 960ns 2.4640us [CUDA memcpy DtoH] | |
0.01% 27.776us 10 2.7770us 2.6560us 2.9120us _ZN2at6native18elementwise_kernelILi128ELi4EZNS0_15gpu_kernel_implIZNS0_23gpu_kernel_with_scalarsIZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_EEvS5_RKT_EUlfE_EEvS5_SD_EUliE0_EEviT1_ | |
0.01% 27.552us 10 2.7550us 2.5920us 3.1360us _ZN2at6native18elementwise_kernelILi128ELi4EZNS0_15gpu_kernel_implIZZZNS0_15mul_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvEUlffE_EEvS4_RKT_EUliE0_EEviT1_ | |
0.01% 26.720us 10 2.6720us 2.4960us 2.9760us _ZN2at6native18elementwise_kernelILi128ELi4EZNS0_15gpu_kernel_implIZZZNS0_15neg_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvEUlfE_EEvS4_RKT_EUliE0_EEviT1_ | |
0.01% 26.016us 10 2.6010us 2.4960us 2.7520us at::native::_GLOBAL__N__70_tmpxft_00007ce6_00000000_11_EmbeddingBackwardKernel_compute_75_cpp1_ii_4a9f87d3::krn_partials_per_segment(long*, long const *, long, long) | |
0.01% 24.704us 10 2.4700us 2.3680us 2.5600us _ZN2at6native18elementwise_kernelILi128ELi4EZNS0_15gpu_kernel_implIZZZNS0_16fill_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlvE_EEvS4_RKT_EUliE0_EEviT1_ | |
0.00% 18.816us 10 1.8810us 1.6000us 2.4000us void kernelPointwiseApply2<TensorPowOp<float, int=2>, float, float, unsigned int, int=1, int=1>(OffsetInfo<int=2, float, float>, OffsetInfo<TensorPowOp<float, int=2>, float, unsigned int>, float, float) | |
0.00% 17.344us 10 1.7340us 1.6640us 1.9200us _ZN2at6native18elementwise_kernelILi512ELi1EZNS0_15gpu_kernel_implIZZZNS0_15div_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE0_clEvEUlfE_EEvS4_RKT_EUliE_EEviT1_ | |
0.00% 16.576us 10 1.6570us 1.5680us 1.7600us void kernelPointwiseApply2<TensorPowOp<float, int=1>, float, float, unsigned int, int=1, int=1>(OffsetInfo<int=1, float, float>, OffsetInfo<TensorPowOp<float, int=1>, float, unsigned int>, float, float) | |
0.00% 14.624us 10 1.4620us 1.4080us 1.5680us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__scan::InitAgent<thrust::cuda_cub::cub::ScanTileState<long, bool=1>, int>, thrust::cuda_cub::cub::ScanTileState<long, bool=1>, int>(bool=1, thrust::cuda_cub::cub::ScanTileState<long, bool=1>) | |
0.00% 12.384us 10 1.2380us 1.2160us 1.3120us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__unique_by_key::InitAgent<thrust::cuda_cub::cub::ScanTileState<int, bool=1>, int*, int>, thrust::cuda_cub::cub::ScanTileState<int, bool=1>, unsigned long, int*>(bool=1, thrust::cuda_cub::cub::ScanTileState<int, bool=1>, int*) | |
API calls: 46.39% 3.85477s 26 148.26ms 4.9060us 3.85164s cudaMalloc | |
44.98% 3.73799s 20 186.90ms 6.2020us 3.73784s cudaDeviceSynchronize | |
4.69% 390.10ms 69 5.6537ms 1.6030us 39.560ms cudaStreamSynchronize | |
3.55% 295.34ms 5 59.068ms 921ns 295.33ms cudaFree | |
0.19% 16.143ms 1431 11.281us 5.7860us 127.83us cudaLaunchKernel | |
0.03% 2.3127ms 6549 353ns 227ns 15.723us cudaGetDevice | |
0.02% 1.8191ms 4 454.78us 397.03us 498.10us cudaGetDeviceProperties | |
0.02% 1.7329ms 378 4.5840us 120ns 226.29us cuDeviceGetAttribute | |
0.02% 1.5570ms 3280 474ns 250ns 18.329us cudaSetDevice | |
0.02% 1.4655ms 158 9.2750us 2.6380us 47.200us cudaMemsetAsync | |
0.02% 1.3464ms 3 448.81us 22.396us 1.2946ms cudaMemcpy | |
0.01% 1.1193ms 59 18.970us 7.4570us 160.37us cudaMemcpyAsync | |
0.01% 1.1062ms 2 553.08us 110.49us 995.66us cudaHostAlloc | |
0.01% 891.47us 4 222.87us 207.85us 246.84us cuDeviceTotalMem | |
0.01% 721.68us 16 45.105us 15.948us 379.98us cudaStreamCreateWithFlags | |
0.00% 402.34us 8 50.292us 11.015us 279.55us cudaStreamCreateWithPriority | |
0.00% 289.82us 100 2.8980us 1.9730us 13.529us cudaFuncGetAttributes | |
0.00% 277.51us 1551 178ns 91ns 1.1910us cudaGetLastError | |
0.00% 193.34us 120 1.6110us 1.0500us 4.0170us cudaEventRecord | |
0.00% 156.79us 4 39.197us 32.734us 54.488us cuDeviceGetName | |
0.00% 133.90us 329 406ns 218ns 18.548us cudaDeviceGetAttribute | |
0.00% 69.315us 72 962ns 374ns 17.370us cudaEventCreateWithFlags | |
0.00% 59.417us 520 114ns 92ns 284ns cudaPeekAtLastError | |
0.00% 42.002us 30 1.4000us 798ns 12.811us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags | |
0.00% 7.9490us 29 274ns 99ns 1.6330us cudaGetDeviceCount | |
0.00% 5.2270us 2 2.6130us 1.9590us 3.2680us cuDeviceGetPCIBusId | |
0.00% 4.0520us 2 2.0260us 1.6980us 2.3540us cudaHostGetDevicePointer | |
0.00% 2.6600us 2 1.3300us 1.2800us 1.3800us cudaDeviceGetStreamPriorityRange | |
0.00% 2.4270us 6 404ns 154ns 1.0610us cuDeviceGet | |
0.00% 1.7530us 1 1.7530us 1.7530us 1.7530us cuInit | |
0.00% 1.5940us 4 398ns 207ns 679ns cuDeviceGetCount | |
0.00% 864ns 4 216ns 169ns 269ns cuDeviceGetUuid | |
0.00% 789ns 1 789ns 789ns 789ns cuDriverGetVersion |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment