Skip to content

Instantly share code, notes, and snippets.

@heiner
Created September 10, 2019 16:12
Show Gist options
  • Save heiner/25a4bd8e38fb33fbefc4c59e069ea475 to your computer and use it in GitHub Desktop.
Save heiner/25a4bd8e38fb33fbefc4c59e069ea475 to your computer and use it in GitHub Desktop.
==50689== NVPROF is profiling process 50689, command: python simpleembed.py
==50689== Profiling application: python simpleembed.py
==50689== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 57.66% 248.18ms 30 8.2726ms 5.6467ms 13.019ms maxwell_scudnn_128x32_stridedB_splitK_interior_nn
16.02% 68.946ms 30 2.2982ms 1.0702ms 4.6368ms void calc_bias_diff<int=2, float, float, int=128, int=0>(cudnnTensorStruct, float const *, cudnnTensorStruct, float*, float, float, int)
4.22% 18.172ms 30 605.73us 2.6240us 945.47us _ZN2at6native18elementwise_kernelILi128ELi4EZNS0_15gpu_kernel_implIZNS0_16copy_kernel_implIffEEvRNS_14TensorIteratorEEUlfE_EEvS5_RKT_EUliE0_EEviT1_
3.22% 13.858ms 30 461.94us 100.42us 1.1248ms maxwell_scudnn_128x32_stridedB_small_nn
3.09% 13.316ms 30 443.88us 113.18us 1.0100ms maxwell_scudnn_128x32_relu_interior_nn
2.45% 10.565ms 100 105.65us 100.67us 113.41us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__merge_sort::MergeAgent<thrust::device_ptr<long>, thrust::device_ptr<long>, long, ThrustLTOp<long, bool=0>, thrust::detail::integral_constant<bool, bool=1>>, bool, thrust::device_ptr<long>, thrust::device_ptr<long>, long, long*, long*, ThrustLTOp<long, bool=0>, long*, long>(thrust::device_ptr<long>, thrust::device_ptr<long>, long, long, bool=0, ThrustLTOp<long, bool=0>, bool, bool=1, thrust::detail::integral_constant<bool, bool=1>)
2.45% 10.524ms 60 175.40us 15.904us 546.14us _ZN2at6native18elementwise_kernelILi512ELi1EZNS0_15gpu_kernel_implIZNS0_21threshold_kernel_implIfEEvRNS_14TensorIteratorET_S6_EUlffE_EEvS5_RKS6_EUliE_EEviT1_
2.24% 9.6553ms 10 965.53us 965.05us 966.24us void at::native::_GLOBAL__N__70_tmpxft_00007ce6_00000000_11_EmbeddingBackwardKernel_compute_75_cpp1_ii_4a9f87d3::compute_grad_weight<float>(long*, float*, at::native::_GLOBAL__N__70_tmpxft_00007ce6_00000000_11_EmbeddingBackwardKernel_compute_75_cpp1_ii_4a9f87d3::compute_grad_weight<float>, long, long, at::native::_GLOBAL__N__70_tmpxft_00007ce6_00000000_11_EmbeddingBackwardKernel_compute_75_cpp1_ii_4a9f87d3::compute_grad_weight<float>, long, at::AccumulateType<long*, bool=1>::type*, int, long)
2.10% 9.0427ms 10 904.27us 903.74us 905.44us void at::native::_GLOBAL__N__63_tmpxft_00007a48_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_db999de0::MaxPoolBackward<float, float>(int, float const *, long const *, int, int, int, int, int, int, int, int, int, int, int, int, int, int, at::native::_GLOBAL__N__63_tmpxft_00007a48_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_db999de0::MaxPoolBackward<float, float>*)
1.75% 7.5375ms 10 753.75us 752.32us 755.64us void indexSelectLargeIndex<float, unsigned int, int=2, int=2, int=-2, bool=1>(TensorInfo<float, unsigned int>, TensorInfo<float, unsigned int>, TensorInfo<long, unsigned int>, int, int, unsigned int, unsigned int, long)
1.62% 6.9521ms 30 231.74us 30.080us 598.91us void op_generic_tensor_kernel<int=2, float, float, float, int=256, cudnnGenericOp_t=0, cudnnNanPropagation_t=0, cudnnDimOrder_t=0, int=0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const *, cudnnTensorStruct, float const *, float, float, float, float, dimArray, reducedDivisorArray, bool)
0.73% 3.1329ms 10 313.29us 312.32us 314.11us void at::native::_GLOBAL__N__63_tmpxft_00007a48_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_db999de0::MaxPoolForward<float, float>(int, float const *, int, int, int, int, int, int, int, int, int, int, int, int, int, int, at::native::_GLOBAL__N__63_tmpxft_00007a48_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_db999de0::MaxPoolForward<float, float>*, long*)
0.70% 3.0293ms 10 302.93us 300.70us 308.26us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__merge_sort::BlockSortAgent<thrust::device_ptr<long>, thrust::device_ptr<long>, long, ThrustLTOp<long, bool=0>, thrust::detail::integral_constant<bool, bool=1>, thrust::detail::integral_constant<bool, bool=0>>, bool, thrust::device_ptr<long>, thrust::device_ptr<long>, long, long*, long*, ThrustLTOp<long, bool=0>>(thrust::device_ptr<long>, thrust::device_ptr<long>, long, long, bool=0, ThrustLTOp<long, bool=0>, bool)
0.40% 1.7338ms 100 17.338us 9.1200us 23.104us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__merge_sort::PartitionAgent<thrust::device_ptr<long>, long, ThrustLTOp<long, bool=0>>, bool, thrust::device_ptr<long>, long*, long, unsigned long, long*, ThrustLTOp<long, bool=0>, long, int>(thrust::device_ptr<long>, long, long, bool=0, ThrustLTOp<long, bool=0>, thrust::cuda_cub::__merge_sort::PartitionAgent<thrust::device_ptr<long>, long, ThrustLTOp<long, bool=0>>, bool, thrust::device_ptr<long>, long*)
0.25% 1.0856ms 10 108.56us 108.03us 109.38us _ZN84_GLOBAL__N__60_tmpxft_00007aab_00000000_11_Distributions_compute_75_cpp1_ii_c3aa7ee643distribution_elementwise_grid_stride_kernelImLi2EZZZN2at6native18random_kernel_cudaERNS1_14TensorIteratorEmlPNS1_9GeneratorEENKUlvE_clEvENKUlvE4_clEvEUlP24curandStatePhilox4_32_10E_ZNS_27distribution_nullary_kernelIlmLi2ESB_ZZZNS2_18random_kernel_cudaES4_mlS6_ENKS7_clEvENKS8_clEvEUlmE_EEvS4_PNS1_13CUDAGeneratorERKT2_T3_EUlimE_EEviSt4pairImmET1_SG_
0.16% 695.42us 20 34.770us 17.376us 51.776us [CUDA memcpy DtoD]
0.13% 546.30us 10 54.629us 52.927us 55.520us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__unique_by_key::UniqueByKeyAgent<thrust::device_ptr<long>, thrust::counting_iterator<int, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::device_ptr<long>, thrust::device_ptr<long>, thrust::equal_to<long>, int, int*>, thrust::device_ptr<long>, thrust::counting_iterator<int, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::device_ptr<long>, thrust::device_ptr<long>, thrust::equal_to<long>, int*, int, thrust::cuda_cub::cub::ScanTileState<int, bool=1>, unsigned long>(thrust::device_ptr<long>, int, thrust::use_default, thrust::use_default, thrust::use_default, thrust::counting_iterator<int, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::device_ptr<long>, thrust::device_ptr<long>, long)
0.07% 312.13us 10 31.212us 30.880us 31.904us void at::native::_GLOBAL__N__70_tmpxft_00007ce6_00000000_11_EmbeddingBackwardKernel_compute_75_cpp1_ii_4a9f87d3::sum_and_scatter<float>(long*, float*, long, at::native::_GLOBAL__N__70_tmpxft_00007ce6_00000000_11_EmbeddingBackwardKernel_compute_75_cpp1_ii_4a9f87d3::sum_and_scatter<float>, long, at::AccumulateType<long*, bool=1>::type const *, long const *, long, long)
0.07% 290.94us 10 29.094us 28.224us 29.535us sgemm_32x32x32_NT_vec
0.07% 286.56us 10 28.655us 28.096us 29.248us sgemm_32x32x32_TN_vec
0.05% 235.74us 10 23.574us 23.168us 24.000us maxwell_sgemm_128x64_nn
0.05% 229.12us 10 22.911us 22.624us 23.232us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__transform::unary_transform_f<thrust::counting_iterator<long, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::device_ptr<long>, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<long>, thrust::cuda_cub::__transform::always_true_predicate>, long>, thrust::cuda_cub::__transform::unary_transform_f<thrust::counting_iterator<long, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::device_ptr<long>, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<long>, thrust::cuda_cub::__transform::always_true_predicate>, long>(thrust::use_default, thrust::use_default)
0.04% 183.04us 90 2.0330us 1.5040us 4.6720us _ZN2at6native18elementwise_kernelILi512ELi1EZNS0_15gpu_kernel_implIZZZNS0_19addcdiv_cuda_kernelERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlfffE_EEvS4_RKT_EUliE_EEviT1_
0.04% 166.43us 158 1.0530us 768ns 3.7120us [CUDA memset]
0.04% 156.42us 81 1.9310us 1.2160us 3.5200us _ZN2at6native18elementwise_kernelILi512ELi1EZNS0_15gpu_kernel_implIZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_EEvS4_RKT_EUliE_EEviT1_
0.03% 138.37us 90 1.5370us 1.1840us 15.008us void kernelPointwiseApply2<Tensor_sqrt_Float_Op, float, float, unsigned int, int=1, int=1>(OffsetInfo<float, unsigned int, int=1>, OffsetInfo<float, unsigned int, int=1>, unsigned int, Tensor_sqrt_Float_Op)
0.03% 136.13us 60 2.2680us 1.3760us 3.9680us cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams)
0.03% 131.33us 90 1.4590us 1.2160us 2.6560us _ZN2at6native18elementwise_kernelILi512ELi1EZNS0_15gpu_kernel_implIZZZNS0_19addcmul_cuda_kernelERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlfffE_EEvS4_RKT_EUliE_EEviT1_
0.03% 129.47us 90 1.4380us 1.0880us 2.8800us _ZN2at6native18elementwise_kernelILi512ELi1EZNS0_15gpu_kernel_implIZNS0_23gpu_kernel_with_scalarsIZZZNS0_15mul_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvEUlffE_EEvS5_RKT_EUlfE0_EEvS5_SB_EUliE_EEviT1_
0.03% 112.54us 90 1.2500us 1.0560us 2.4960us _ZN2at6native18elementwise_kernelILi512ELi1EZNS0_15gpu_kernel_implIZNS0_23gpu_kernel_with_scalarsIZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_EEvS5_RKT_EUlfE0_EEvS5_SD_EUliE_EEviT1_
0.03% 108.00us 10 10.799us 10.624us 10.976us at::native::_GLOBAL__N__70_tmpxft_00007ce6_00000000_11_EmbeddingBackwardKernel_compute_75_cpp1_ii_4a9f87d3::krn_partial_segment_offset(long*, long const *, long const *, long const *, long)
0.02% 107.55us 10 10.755us 10.496us 11.392us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__scan::ScanAgent<thrust::device_ptr<long>, thrust::device_ptr<long>, thrust::plus<long>, int, long, thrust::detail::integral_constant<bool, bool=0>>, thrust::device_ptr<long>, thrust::device_ptr<long>, thrust::plus<long>, int, thrust::cuda_cub::cub::ScanTileState<long, bool=1>, thrust::cuda_cub::__scan::AddInitToExclusiveScan<long, thrust::plus<long>>>(thrust::device_ptr<long>, thrust::device_ptr<long>, long, thrust::plus<long>, int, long)
0.02% 88.159us 10 8.8150us 8.6400us 9.0240us _ZN2at6native13reduce_kernelILi512ENS0_8ReduceOpIfNS0_14func_wrapper_tIfZNS0_15sum_kernel_implIfffEEvRNS_14TensorIteratorEEUlffE_EEjfLi4EEEEEvT0_
0.02% 86.304us 10 8.6300us 8.1920us 9.0240us void at::native::reduce_kernel<int=512, at::native::ReduceOp<float, at::native::MeanOps<float, float>, unsigned int, float, int=4>>(float)
0.02% 83.262us 60 1.3870us 1.1190us 2.6560us cudnn::maxwell::gemm::computeBOffsetsKernel(cudnn::maxwell::gemm::ComputeBOffsetsParams)
0.01% 62.240us 30 2.0740us 1.3760us 3.0720us cudnn::maxwell::gemm::computeWgradOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams)
0.01% 52.639us 12 4.3860us 864ns 35.520us [CUDA memcpy HtoD]
0.01% 50.592us 30 1.6860us 1.4080us 1.9200us void scalePackedTensor_kernel<float, float>(cudnnTensor4dStruct, float*, float)
0.01% 41.088us 20 2.0540us 1.2480us 2.9440us _ZN2at6native18elementwise_kernelILi128ELi4EZNS0_15gpu_kernel_implIZNS0_23gpu_kernel_with_scalarsIZZZNS0_15mul_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvEUlffE_EEvS5_RKT_EUlfE0_EEvS5_SB_EUliE0_EEviT1_
0.01% 33.984us 30 1.1320us 960ns 2.4640us [CUDA memcpy DtoH]
0.01% 27.776us 10 2.7770us 2.6560us 2.9120us _ZN2at6native18elementwise_kernelILi128ELi4EZNS0_15gpu_kernel_implIZNS0_23gpu_kernel_with_scalarsIZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_EEvS5_RKT_EUlfE_EEvS5_SD_EUliE0_EEviT1_
0.01% 27.552us 10 2.7550us 2.5920us 3.1360us _ZN2at6native18elementwise_kernelILi128ELi4EZNS0_15gpu_kernel_implIZZZNS0_15mul_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvEUlffE_EEvS4_RKT_EUliE0_EEviT1_
0.01% 26.720us 10 2.6720us 2.4960us 2.9760us _ZN2at6native18elementwise_kernelILi128ELi4EZNS0_15gpu_kernel_implIZZZNS0_15neg_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvEUlfE_EEvS4_RKT_EUliE0_EEviT1_
0.01% 26.016us 10 2.6010us 2.4960us 2.7520us at::native::_GLOBAL__N__70_tmpxft_00007ce6_00000000_11_EmbeddingBackwardKernel_compute_75_cpp1_ii_4a9f87d3::krn_partials_per_segment(long*, long const *, long, long)
0.01% 24.704us 10 2.4700us 2.3680us 2.5600us _ZN2at6native18elementwise_kernelILi128ELi4EZNS0_15gpu_kernel_implIZZZNS0_16fill_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlvE_EEvS4_RKT_EUliE0_EEviT1_
0.00% 18.816us 10 1.8810us 1.6000us 2.4000us void kernelPointwiseApply2<TensorPowOp<float, int=2>, float, float, unsigned int, int=1, int=1>(OffsetInfo<int=2, float, float>, OffsetInfo<TensorPowOp<float, int=2>, float, unsigned int>, float, float)
0.00% 17.344us 10 1.7340us 1.6640us 1.9200us _ZN2at6native18elementwise_kernelILi512ELi1EZNS0_15gpu_kernel_implIZZZNS0_15div_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE0_clEvEUlfE_EEvS4_RKT_EUliE_EEviT1_
0.00% 16.576us 10 1.6570us 1.5680us 1.7600us void kernelPointwiseApply2<TensorPowOp<float, int=1>, float, float, unsigned int, int=1, int=1>(OffsetInfo<int=1, float, float>, OffsetInfo<TensorPowOp<float, int=1>, float, unsigned int>, float, float)
0.00% 14.624us 10 1.4620us 1.4080us 1.5680us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__scan::InitAgent<thrust::cuda_cub::cub::ScanTileState<long, bool=1>, int>, thrust::cuda_cub::cub::ScanTileState<long, bool=1>, int>(bool=1, thrust::cuda_cub::cub::ScanTileState<long, bool=1>)
0.00% 12.384us 10 1.2380us 1.2160us 1.3120us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__unique_by_key::InitAgent<thrust::cuda_cub::cub::ScanTileState<int, bool=1>, int*, int>, thrust::cuda_cub::cub::ScanTileState<int, bool=1>, unsigned long, int*>(bool=1, thrust::cuda_cub::cub::ScanTileState<int, bool=1>, int*)
API calls: 46.39% 3.85477s 26 148.26ms 4.9060us 3.85164s cudaMalloc
44.98% 3.73799s 20 186.90ms 6.2020us 3.73784s cudaDeviceSynchronize
4.69% 390.10ms 69 5.6537ms 1.6030us 39.560ms cudaStreamSynchronize
3.55% 295.34ms 5 59.068ms 921ns 295.33ms cudaFree
0.19% 16.143ms 1431 11.281us 5.7860us 127.83us cudaLaunchKernel
0.03% 2.3127ms 6549 353ns 227ns 15.723us cudaGetDevice
0.02% 1.8191ms 4 454.78us 397.03us 498.10us cudaGetDeviceProperties
0.02% 1.7329ms 378 4.5840us 120ns 226.29us cuDeviceGetAttribute
0.02% 1.5570ms 3280 474ns 250ns 18.329us cudaSetDevice
0.02% 1.4655ms 158 9.2750us 2.6380us 47.200us cudaMemsetAsync
0.02% 1.3464ms 3 448.81us 22.396us 1.2946ms cudaMemcpy
0.01% 1.1193ms 59 18.970us 7.4570us 160.37us cudaMemcpyAsync
0.01% 1.1062ms 2 553.08us 110.49us 995.66us cudaHostAlloc
0.01% 891.47us 4 222.87us 207.85us 246.84us cuDeviceTotalMem
0.01% 721.68us 16 45.105us 15.948us 379.98us cudaStreamCreateWithFlags
0.00% 402.34us 8 50.292us 11.015us 279.55us cudaStreamCreateWithPriority
0.00% 289.82us 100 2.8980us 1.9730us 13.529us cudaFuncGetAttributes
0.00% 277.51us 1551 178ns 91ns 1.1910us cudaGetLastError
0.00% 193.34us 120 1.6110us 1.0500us 4.0170us cudaEventRecord
0.00% 156.79us 4 39.197us 32.734us 54.488us cuDeviceGetName
0.00% 133.90us 329 406ns 218ns 18.548us cudaDeviceGetAttribute
0.00% 69.315us 72 962ns 374ns 17.370us cudaEventCreateWithFlags
0.00% 59.417us 520 114ns 92ns 284ns cudaPeekAtLastError
0.00% 42.002us 30 1.4000us 798ns 12.811us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
0.00% 7.9490us 29 274ns 99ns 1.6330us cudaGetDeviceCount
0.00% 5.2270us 2 2.6130us 1.9590us 3.2680us cuDeviceGetPCIBusId
0.00% 4.0520us 2 2.0260us 1.6980us 2.3540us cudaHostGetDevicePointer
0.00% 2.6600us 2 1.3300us 1.2800us 1.3800us cudaDeviceGetStreamPriorityRange
0.00% 2.4270us 6 404ns 154ns 1.0610us cuDeviceGet
0.00% 1.7530us 1 1.7530us 1.7530us 1.7530us cuInit
0.00% 1.5940us 4 398ns 207ns 679ns cuDeviceGetCount
0.00% 864ns 4 216ns 169ns 269ns cuDeviceGetUuid
0.00% 789ns 1 789ns 789ns 789ns cuDriverGetVersion
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment