heiner · September 10, 2019 16:12
diff --git a/nvprof.output.txt b/nvprof.output.txt
 ==50689== NVPROF is profiling process 50689, command: python simpleembed.py
 ==50689== Profiling application: python simpleembed.py
 ==50689== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   57.66%  248.18ms        30  8.2726ms  5.6467ms  13.019ms  maxwell_scudnn_128x32_stridedB_splitK_interior_nn
                   16.02%  68.946ms        30  2.2982ms  1.0702ms  4.6368ms  void calc_bias_diff<int=2, float, float, int=128, int=0>(cudnnTensorStruct, float const *, cudnnTensorStruct, float*, float, float, int)
                    4.22%  18.172ms        30  605.73us  2.6240us  945.47us  _ZN2at6native18elementwise_kernelILi128ELi4EZNS0_15gpu_kernel_implIZNS0_16copy_kernel_implIffEEvRNS_14TensorIteratorEEUlfE_EEvS5_RKT_EUliE0_EEviT1_
                    3.22%  13.858ms        30  461.94us  100.42us  1.1248ms  maxwell_scudnn_128x32_stridedB_small_nn
                    3.09%  13.316ms        30  443.88us  113.18us  1.0100ms  maxwell_scudnn_128x32_relu_interior_nn
                    2.45%  10.565ms       100  105.65us  100.67us  113.41us  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__merge_sort::MergeAgent<thrust::device_ptr<long>, thrust::device_ptr<long>, long, ThrustLTOp<long, bool=0>, thrust::detail::integral_constant<bool, bool=1>>, bool, thrust::device_ptr<long>, thrust::device_ptr<long>, long, long*, long*, ThrustLTOp<long, bool=0>, long*, long>(thrust::device_ptr<long>, thrust::device_ptr<long>, long, long, bool=0, ThrustLTOp<long, bool=0>, bool, bool=1, thrust::detail::integral_constant<bool, bool=1>)
                    2.45%  10.524ms        60  175.40us  15.904us  546.14us  _ZN2at6native18elementwise_kernelILi512ELi1EZNS0_15gpu_kernel_implIZNS0_21threshold_kernel_implIfEEvRNS_14TensorIteratorET_S6_EUlffE_EEvS5_RKS6_EUliE_EEviT1_
                    2.24%  9.6553ms        10  965.53us  965.05us  966.24us  void at::native::_GLOBAL__N__70_tmpxft_00007ce6_00000000_11_EmbeddingBackwardKernel_compute_75_cpp1_ii_4a9f87d3::compute_grad_weight<float>(long*, float*, at::native::_GLOBAL__N__70_tmpxft_00007ce6_00000000_11_EmbeddingBackwardKernel_compute_75_cpp1_ii_4a9f87d3::compute_grad_weight<float>, long, long, at::native::_GLOBAL__N__70_tmpxft_00007ce6_00000000_11_EmbeddingBackwardKernel_compute_75_cpp1_ii_4a9f87d3::compute_grad_weight<float>, long, at::AccumulateType<long*, bool=1>::type*, int, long)
                    2.10%  9.0427ms        10  904.27us  903.74us  905.44us  void at::native::_GLOBAL__N__63_tmpxft_00007a48_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_db999de0::MaxPoolBackward<float, float>(int, float const *, long const *, int, int, int, int, int, int, int, int, int, int, int, int, int, int, at::native::_GLOBAL__N__63_tmpxft_00007a48_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_db999de0::MaxPoolBackward<float, float>*)
                    1.75%  7.5375ms        10  753.75us  752.32us  755.64us  void indexSelectLargeIndex<float, unsigned int, int=2, int=2, int=-2, bool=1>(TensorInfo<float, unsigned int>, TensorInfo<float, unsigned int>, TensorInfo<long, unsigned int>, int, int, unsigned int, unsigned int, long)
                    1.62%  6.9521ms        30  231.74us  30.080us  598.91us  void op_generic_tensor_kernel<int=2, float, float, float, int=256, cudnnGenericOp_t=0, cudnnNanPropagation_t=0, cudnnDimOrder_t=0, int=0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const *, cudnnTensorStruct, float const *, float, float, float, float, dimArray, reducedDivisorArray, bool)
                    0.73%  3.1329ms        10  313.29us  312.32us  314.11us  void at::native::_GLOBAL__N__63_tmpxft_00007a48_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_db999de0::MaxPoolForward<float, float>(int, float const *, int, int, int, int, int, int, int, int, int, int, int, int, int, int, at::native::_GLOBAL__N__63_tmpxft_00007a48_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_db999de0::MaxPoolForward<float, float>*, long*)
                    0.70%  3.0293ms        10  302.93us  300.70us  308.26us  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__merge_sort::BlockSortAgent<thrust::device_ptr<long>, thrust::device_ptr<long>, long, ThrustLTOp<long, bool=0>, thrust::detail::integral_constant<bool, bool=1>, thrust::detail::integral_constant<bool, bool=0>>, bool, thrust::device_ptr<long>, thrust::device_ptr<long>, long, long*, long*, ThrustLTOp<long, bool=0>>(thrust::device_ptr<long>, thrust::device_ptr<long>, long, long, bool=0, ThrustLTOp<long, bool=0>, bool)
                    0.40%  1.7338ms       100  17.338us  9.1200us  23.104us  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__merge_sort::PartitionAgent<thrust::device_ptr<long>, long, ThrustLTOp<long, bool=0>>, bool, thrust::device_ptr<long>, long*, long, unsigned long, long*, ThrustLTOp<long, bool=0>, long, int>(thrust::device_ptr<long>, long, long, bool=0, ThrustLTOp<long, bool=0>, thrust::cuda_cub::__merge_sort::PartitionAgent<thrust::device_ptr<long>, long, ThrustLTOp<long, bool=0>>, bool, thrust::device_ptr<long>, long*)
                    0.25%  1.0856ms        10  108.56us  108.03us  109.38us  _ZN84_GLOBAL__N__60_tmpxft_00007aab_00000000_11_Distributions_compute_75_cpp1_ii_c3aa7ee643distribution_elementwise_grid_stride_kernelImLi2EZZZN2at6native18random_kernel_cudaERNS1_14TensorIteratorEmlPNS1_9GeneratorEENKUlvE_clEvENKUlvE4_clEvEUlP24curandStatePhilox4_32_10E_ZNS_27distribution_nullary_kernelIlmLi2ESB_ZZZNS2_18random_kernel_cudaES4_mlS6_ENKS7_clEvENKS8_clEvEUlmE_EEvS4_PNS1_13CUDAGeneratorERKT2_T3_EUlimE_EEviSt4pairImmET1_SG_
                    0.16%  695.42us        20  34.770us  17.376us  51.776us  [CUDA memcpy DtoD]
                    0.13%  546.30us        10  54.629us  52.927us  55.520us  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__unique_by_key::UniqueByKeyAgent<thrust::device_ptr<long>, thrust::counting_iterator<int, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::device_ptr<long>, thrust::device_ptr<long>, thrust::equal_to<long>, int, int*>, thrust::device_ptr<long>, thrust::counting_iterator<int, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::device_ptr<long>, thrust::device_ptr<long>, thrust::equal_to<long>, int*, int, thrust::cuda_cub::cub::ScanTileState<int, bool=1>, unsigned long>(thrust::device_ptr<long>, int, thrust::use_default, thrust::use_default, thrust::use_default, thrust::counting_iterator<int, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::device_ptr<long>, thrust::device_ptr<long>, long)
                    0.07%  312.13us        10  31.212us  30.880us  31.904us  void at::native::_GLOBAL__N__70_tmpxft_00007ce6_00000000_11_EmbeddingBackwardKernel_compute_75_cpp1_ii_4a9f87d3::sum_and_scatter<float>(long*, float*, long, at::native::_GLOBAL__N__70_tmpxft_00007ce6_00000000_11_EmbeddingBackwardKernel_compute_75_cpp1_ii_4a9f87d3::sum_and_scatter<float>, long, at::AccumulateType<long*, bool=1>::type const *, long const *, long, long)
                    0.07%  290.94us        10  29.094us  28.224us  29.535us  sgemm_32x32x32_NT_vec
                    0.07%  286.56us        10  28.655us  28.096us  29.248us  sgemm_32x32x32_TN_vec
                    0.05%  235.74us        10  23.574us  23.168us  24.000us  maxwell_sgemm_128x64_nn
                    0.05%  229.12us        10  22.911us  22.624us  23.232us  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__transform::unary_transform_f<thrust::counting_iterator<long, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::device_ptr<long>, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<long>, thrust::cuda_cub::__transform::always_true_predicate>, long>, thrust::cuda_cub::__transform::unary_transform_f<thrust::counting_iterator<long, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::device_ptr<long>, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<long>, thrust::cuda_cub::__transform::always_true_predicate>, long>(thrust::use_default, thrust::use_default)
                    0.04%  183.04us        90  2.0330us  1.5040us  4.6720us  _ZN2at6native18elementwise_kernelILi512ELi1EZNS0_15gpu_kernel_implIZZZNS0_19addcdiv_cuda_kernelERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlfffE_EEvS4_RKT_EUliE_EEviT1_
                    0.04%  166.43us       158  1.0530us     768ns  3.7120us  [CUDA memset]
                    0.04%  156.42us        81  1.9310us  1.2160us  3.5200us  _ZN2at6native18elementwise_kernelILi512ELi1EZNS0_15gpu_kernel_implIZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_EEvS4_RKT_EUliE_EEviT1_
                    0.03%  138.37us        90  1.5370us  1.1840us  15.008us  void kernelPointwiseApply2<Tensor_sqrt_Float_Op, float, float, unsigned int, int=1, int=1>(OffsetInfo<float, unsigned int, int=1>, OffsetInfo<float, unsigned int, int=1>, unsigned int, Tensor_sqrt_Float_Op)
                    0.03%  136.13us        60  2.2680us  1.3760us  3.9680us  cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams)
                    0.03%  131.33us        90  1.4590us  1.2160us  2.6560us  _ZN2at6native18elementwise_kernelILi512ELi1EZNS0_15gpu_kernel_implIZZZNS0_19addcmul_cuda_kernelERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlfffE_EEvS4_RKT_EUliE_EEviT1_
                    0.03%  129.47us        90  1.4380us  1.0880us  2.8800us  _ZN2at6native18elementwise_kernelILi512ELi1EZNS0_15gpu_kernel_implIZNS0_23gpu_kernel_with_scalarsIZZZNS0_15mul_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvEUlffE_EEvS5_RKT_EUlfE0_EEvS5_SB_EUliE_EEviT1_
                    0.03%  112.54us        90  1.2500us  1.0560us  2.4960us  _ZN2at6native18elementwise_kernelILi512ELi1EZNS0_15gpu_kernel_implIZNS0_23gpu_kernel_with_scalarsIZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_EEvS5_RKT_EUlfE0_EEvS5_SD_EUliE_EEviT1_
                    0.03%  108.00us        10  10.799us  10.624us  10.976us  at::native::_GLOBAL__N__70_tmpxft_00007ce6_00000000_11_EmbeddingBackwardKernel_compute_75_cpp1_ii_4a9f87d3::krn_partial_segment_offset(long*, long const *, long const *, long const *, long)
                    0.02%  107.55us        10  10.755us  10.496us  11.392us  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__scan::ScanAgent<thrust::device_ptr<long>, thrust::device_ptr<long>, thrust::plus<long>, int, long, thrust::detail::integral_constant<bool, bool=0>>, thrust::device_ptr<long>, thrust::device_ptr<long>, thrust::plus<long>, int, thrust::cuda_cub::cub::ScanTileState<long, bool=1>, thrust::cuda_cub::__scan::AddInitToExclusiveScan<long, thrust::plus<long>>>(thrust::device_ptr<long>, thrust::device_ptr<long>, long, thrust::plus<long>, int, long)
                    0.02%  88.159us        10  8.8150us  8.6400us  9.0240us  _ZN2at6native13reduce_kernelILi512ENS0_8ReduceOpIfNS0_14func_wrapper_tIfZNS0_15sum_kernel_implIfffEEvRNS_14TensorIteratorEEUlffE_EEjfLi4EEEEEvT0_
                    0.02%  86.304us        10  8.6300us  8.1920us  9.0240us  void at::native::reduce_kernel<int=512, at::native::ReduceOp<float, at::native::MeanOps<float, float>, unsigned int, float, int=4>>(float)
                    0.02%  83.262us        60  1.3870us  1.1190us  2.6560us  cudnn::maxwell::gemm::computeBOffsetsKernel(cudnn::maxwell::gemm::ComputeBOffsetsParams)
                    0.01%  62.240us        30  2.0740us  1.3760us  3.0720us  cudnn::maxwell::gemm::computeWgradOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams)
                    0.01%  52.639us        12  4.3860us     864ns  35.520us  [CUDA memcpy HtoD]
                    0.01%  50.592us        30  1.6860us  1.4080us  1.9200us  void scalePackedTensor_kernel<float, float>(cudnnTensor4dStruct, float*, float)
                    0.01%  41.088us        20  2.0540us  1.2480us  2.9440us  _ZN2at6native18elementwise_kernelILi128ELi4EZNS0_15gpu_kernel_implIZNS0_23gpu_kernel_with_scalarsIZZZNS0_15mul_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvEUlffE_EEvS5_RKT_EUlfE0_EEvS5_SB_EUliE0_EEviT1_
                    0.01%  33.984us        30  1.1320us     960ns  2.4640us  [CUDA memcpy DtoH]
                    0.01%  27.776us        10  2.7770us  2.6560us  2.9120us  _ZN2at6native18elementwise_kernelILi128ELi4EZNS0_15gpu_kernel_implIZNS0_23gpu_kernel_with_scalarsIZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_EEvS5_RKT_EUlfE_EEvS5_SD_EUliE0_EEviT1_
                    0.01%  27.552us        10  2.7550us  2.5920us  3.1360us  _ZN2at6native18elementwise_kernelILi128ELi4EZNS0_15gpu_kernel_implIZZZNS0_15mul_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvEUlffE_EEvS4_RKT_EUliE0_EEviT1_
                    0.01%  26.720us        10  2.6720us  2.4960us  2.9760us  _ZN2at6native18elementwise_kernelILi128ELi4EZNS0_15gpu_kernel_implIZZZNS0_15neg_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvEUlfE_EEvS4_RKT_EUliE0_EEviT1_
                    0.01%  26.016us        10  2.6010us  2.4960us  2.7520us  at::native::_GLOBAL__N__70_tmpxft_00007ce6_00000000_11_EmbeddingBackwardKernel_compute_75_cpp1_ii_4a9f87d3::krn_partials_per_segment(long*, long const *, long, long)
                    0.01%  24.704us        10  2.4700us  2.3680us  2.5600us  _ZN2at6native18elementwise_kernelILi128ELi4EZNS0_15gpu_kernel_implIZZZNS0_16fill_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlvE_EEvS4_RKT_EUliE0_EEviT1_
                    0.00%  18.816us        10  1.8810us  1.6000us  2.4000us  void kernelPointwiseApply2<TensorPowOp<float, int=2>, float, float, unsigned int, int=1, int=1>(OffsetInfo<int=2, float, float>, OffsetInfo<TensorPowOp<float, int=2>, float, unsigned int>, float, float)
                    0.00%  17.344us        10  1.7340us  1.6640us  1.9200us  _ZN2at6native18elementwise_kernelILi512ELi1EZNS0_15gpu_kernel_implIZZZNS0_15div_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE0_clEvEUlfE_EEvS4_RKT_EUliE_EEviT1_
                    0.00%  16.576us        10  1.6570us  1.5680us  1.7600us  void kernelPointwiseApply2<TensorPowOp<float, int=1>, float, float, unsigned int, int=1, int=1>(OffsetInfo<int=1, float, float>, OffsetInfo<TensorPowOp<float, int=1>, float, unsigned int>, float, float)
                    0.00%  14.624us        10  1.4620us  1.4080us  1.5680us  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__scan::InitAgent<thrust::cuda_cub::cub::ScanTileState<long, bool=1>, int>, thrust::cuda_cub::cub::ScanTileState<long, bool=1>, int>(bool=1, thrust::cuda_cub::cub::ScanTileState<long, bool=1>)
                    0.00%  12.384us        10  1.2380us  1.2160us  1.3120us  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__unique_by_key::InitAgent<thrust::cuda_cub::cub::ScanTileState<int, bool=1>, int*, int>, thrust::cuda_cub::cub::ScanTileState<int, bool=1>, unsigned long, int*>(bool=1, thrust::cuda_cub::cub::ScanTileState<int, bool=1>, int*)
      API calls:   46.39%  3.85477s        26  148.26ms  4.9060us  3.85164s  cudaMalloc
                   44.98%  3.73799s        20  186.90ms  6.2020us  3.73784s  cudaDeviceSynchronize
                    4.69%  390.10ms        69  5.6537ms  1.6030us  39.560ms  cudaStreamSynchronize
                    3.55%  295.34ms         5  59.068ms     921ns  295.33ms  cudaFree
                    0.19%  16.143ms      1431  11.281us  5.7860us  127.83us  cudaLaunchKernel
                    0.03%  2.3127ms      6549     353ns     227ns  15.723us  cudaGetDevice
                    0.02%  1.8191ms         4  454.78us  397.03us  498.10us  cudaGetDeviceProperties
                    0.02%  1.7329ms       378  4.5840us     120ns  226.29us  cuDeviceGetAttribute
                    0.02%  1.5570ms      3280     474ns     250ns  18.329us  cudaSetDevice
                    0.02%  1.4655ms       158  9.2750us  2.6380us  47.200us  cudaMemsetAsync
                    0.02%  1.3464ms         3  448.81us  22.396us  1.2946ms  cudaMemcpy
                    0.01%  1.1193ms        59  18.970us  7.4570us  160.37us  cudaMemcpyAsync
                    0.01%  1.1062ms         2  553.08us  110.49us  995.66us  cudaHostAlloc
                    0.01%  891.47us         4  222.87us  207.85us  246.84us  cuDeviceTotalMem
                    0.01%  721.68us        16  45.105us  15.948us  379.98us  cudaStreamCreateWithFlags
                    0.00%  402.34us         8  50.292us  11.015us  279.55us  cudaStreamCreateWithPriority
                    0.00%  289.82us       100  2.8980us  1.9730us  13.529us  cudaFuncGetAttributes
                    0.00%  277.51us      1551     178ns      91ns  1.1910us  cudaGetLastError
                    0.00%  193.34us       120  1.6110us  1.0500us  4.0170us  cudaEventRecord
                    0.00%  156.79us         4  39.197us  32.734us  54.488us  cuDeviceGetName
                    0.00%  133.90us       329     406ns     218ns  18.548us  cudaDeviceGetAttribute
                    0.00%  69.315us        72     962ns     374ns  17.370us  cudaEventCreateWithFlags
                    0.00%  59.417us       520     114ns      92ns     284ns  cudaPeekAtLastError
                    0.00%  42.002us        30  1.4000us     798ns  12.811us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
                    0.00%  7.9490us        29     274ns      99ns  1.6330us  cudaGetDeviceCount
                    0.00%  5.2270us         2  2.6130us  1.9590us  3.2680us  cuDeviceGetPCIBusId
                    0.00%  4.0520us         2  2.0260us  1.6980us  2.3540us  cudaHostGetDevicePointer
                    0.00%  2.6600us         2  1.3300us  1.2800us  1.3800us  cudaDeviceGetStreamPriorityRange
                    0.00%  2.4270us         6     404ns     154ns  1.0610us  cuDeviceGet
                    0.00%  1.7530us         1  1.7530us  1.7530us  1.7530us  cuInit
                    0.00%  1.5940us         4     398ns     207ns     679ns  cuDeviceGetCount
                    0.00%     864ns         4     216ns     169ns     269ns  cuDeviceGetUuid
                    0.00%     789ns         1     789ns     789ns     789ns  cuDriverGetVersion
	==50689== NVPROF is profiling process 50689, command: python simpleembed.py
	==50689== Profiling application: python simpleembed.py
	==50689== Profiling result:
	Type Time(%) Time Calls Avg Min Max Name
	GPU activities: 57.66% 248.18ms 30 8.2726ms 5.6467ms 13.019ms maxwell_scudnn_128x32_stridedB_splitK_interior_nn
	16.02% 68.946ms 30 2.2982ms 1.0702ms 4.6368ms void calc_bias_diff<int=2, float, float, int=128, int=0>(cudnnTensorStruct, float const *, cudnnTensorStruct, float*, float, float, int)
	4.22% 18.172ms 30 605.73us 2.6240us 945.47us _ZN2at6native18elementwise_kernelILi128ELi4EZNS0_15gpu_kernel_implIZNS0_16copy_kernel_implIffEEvRNS_14TensorIteratorEEUlfE_EEvS5_RKT_EUliE0_EEviT1_
	3.22% 13.858ms 30 461.94us 100.42us 1.1248ms maxwell_scudnn_128x32_stridedB_small_nn
	3.09% 13.316ms 30 443.88us 113.18us 1.0100ms maxwell_scudnn_128x32_relu_interior_nn
	2.45% 10.565ms 100 105.65us 100.67us 113.41us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__merge_sort::MergeAgent<thrust::device_ptr<long>, thrust::device_ptr<long>, long, ThrustLTOp<long, bool=0>, thrust::detail::integral_constant<bool, bool=1>>, bool, thrust::device_ptr<long>, thrust::device_ptr<long>, long, long*, long*, ThrustLTOp<long, bool=0>, long*, long>(thrust::device_ptr<long>, thrust::device_ptr<long>, long, long, bool=0, ThrustLTOp<long, bool=0>, bool, bool=1, thrust::detail::integral_constant<bool, bool=1>)
	2.45% 10.524ms 60 175.40us 15.904us 546.14us _ZN2at6native18elementwise_kernelILi512ELi1EZNS0_15gpu_kernel_implIZNS0_21threshold_kernel_implIfEEvRNS_14TensorIteratorET_S6_EUlffE_EEvS5_RKS6_EUliE_EEviT1_
	2.24% 9.6553ms 10 965.53us 965.05us 966.24us void at::native::_GLOBAL__N__70_tmpxft_00007ce6_00000000_11_EmbeddingBackwardKernel_compute_75_cpp1_ii_4a9f87d3::compute_grad_weight<float>(long*, float*, at::native::_GLOBAL__N__70_tmpxft_00007ce6_00000000_11_EmbeddingBackwardKernel_compute_75_cpp1_ii_4a9f87d3::compute_grad_weight<float>, long, long, at::native::_GLOBAL__N__70_tmpxft_00007ce6_00000000_11_EmbeddingBackwardKernel_compute_75_cpp1_ii_4a9f87d3::compute_grad_weight<float>, long, at::AccumulateType<long*, bool=1>::type*, int, long)
	2.10% 9.0427ms 10 904.27us 903.74us 905.44us void at::native::_GLOBAL__N__63_tmpxft_00007a48_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_db999de0::MaxPoolBackward<float, float>(int, float const *, long const *, int, int, int, int, int, int, int, int, int, int, int, int, int, int, at::native::_GLOBAL__N__63_tmpxft_00007a48_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_db999de0::MaxPoolBackward<float, float>*)
	1.75% 7.5375ms 10 753.75us 752.32us 755.64us void indexSelectLargeIndex<float, unsigned int, int=2, int=2, int=-2, bool=1>(TensorInfo<float, unsigned int>, TensorInfo<float, unsigned int>, TensorInfo<long, unsigned int>, int, int, unsigned int, unsigned int, long)
	1.62% 6.9521ms 30 231.74us 30.080us 598.91us void op_generic_tensor_kernel<int=2, float, float, float, int=256, cudnnGenericOp_t=0, cudnnNanPropagation_t=0, cudnnDimOrder_t=0, int=0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const *, cudnnTensorStruct, float const *, float, float, float, float, dimArray, reducedDivisorArray, bool)
	0.73% 3.1329ms 10 313.29us 312.32us 314.11us void at::native::_GLOBAL__N__63_tmpxft_00007a48_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_db999de0::MaxPoolForward<float, float>(int, float const *, int, int, int, int, int, int, int, int, int, int, int, int, int, int, at::native::_GLOBAL__N__63_tmpxft_00007a48_00000000_11_DilatedMaxPool2d_compute_75_cpp1_ii_db999de0::MaxPoolForward<float, float>*, long*)
	0.70% 3.0293ms 10 302.93us 300.70us 308.26us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__merge_sort::BlockSortAgent<thrust::device_ptr<long>, thrust::device_ptr<long>, long, ThrustLTOp<long, bool=0>, thrust::detail::integral_constant<bool, bool=1>, thrust::detail::integral_constant<bool, bool=0>>, bool, thrust::device_ptr<long>, thrust::device_ptr<long>, long, long*, long*, ThrustLTOp<long, bool=0>>(thrust::device_ptr<long>, thrust::device_ptr<long>, long, long, bool=0, ThrustLTOp<long, bool=0>, bool)
	0.40% 1.7338ms 100 17.338us 9.1200us 23.104us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__merge_sort::PartitionAgent<thrust::device_ptr<long>, long, ThrustLTOp<long, bool=0>>, bool, thrust::device_ptr<long>, long*, long, unsigned long, long*, ThrustLTOp<long, bool=0>, long, int>(thrust::device_ptr<long>, long, long, bool=0, ThrustLTOp<long, bool=0>, thrust::cuda_cub::__merge_sort::PartitionAgent<thrust::device_ptr<long>, long, ThrustLTOp<long, bool=0>>, bool, thrust::device_ptr<long>, long*)
	0.25% 1.0856ms 10 108.56us 108.03us 109.38us _ZN84_GLOBAL__N__60_tmpxft_00007aab_00000000_11_Distributions_compute_75_cpp1_ii_c3aa7ee643distribution_elementwise_grid_stride_kernelImLi2EZZZN2at6native18random_kernel_cudaERNS1_14TensorIteratorEmlPNS1_9GeneratorEENKUlvE_clEvENKUlvE4_clEvEUlP24curandStatePhilox4_32_10E_ZNS_27distribution_nullary_kernelIlmLi2ESB_ZZZNS2_18random_kernel_cudaES4_mlS6_ENKS7_clEvENKS8_clEvEUlmE_EEvS4_PNS1_13CUDAGeneratorERKT2_T3_EUlimE_EEviSt4pairImmET1_SG_
	0.16% 695.42us 20 34.770us 17.376us 51.776us [CUDA memcpy DtoD]
	0.13% 546.30us 10 54.629us 52.927us 55.520us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__unique_by_key::UniqueByKeyAgent<thrust::device_ptr<long>, thrust::counting_iterator<int, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::device_ptr<long>, thrust::device_ptr<long>, thrust::equal_to<long>, int, int*>, thrust::device_ptr<long>, thrust::counting_iterator<int, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::device_ptr<long>, thrust::device_ptr<long>, thrust::equal_to<long>, int*, int, thrust::cuda_cub::cub::ScanTileState<int, bool=1>, unsigned long>(thrust::device_ptr<long>, int, thrust::use_default, thrust::use_default, thrust::use_default, thrust::counting_iterator<int, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::device_ptr<long>, thrust::device_ptr<long>, long)
	0.07% 312.13us 10 31.212us 30.880us 31.904us void at::native::_GLOBAL__N__70_tmpxft_00007ce6_00000000_11_EmbeddingBackwardKernel_compute_75_cpp1_ii_4a9f87d3::sum_and_scatter<float>(long*, float*, long, at::native::_GLOBAL__N__70_tmpxft_00007ce6_00000000_11_EmbeddingBackwardKernel_compute_75_cpp1_ii_4a9f87d3::sum_and_scatter<float>, long, at::AccumulateType<long*, bool=1>::type const *, long const *, long, long)
	0.07% 290.94us 10 29.094us 28.224us 29.535us sgemm_32x32x32_NT_vec
	0.07% 286.56us 10 28.655us 28.096us 29.248us sgemm_32x32x32_TN_vec
	0.05% 235.74us 10 23.574us 23.168us 24.000us maxwell_sgemm_128x64_nn
	0.05% 229.12us 10 22.911us 22.624us 23.232us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__transform::unary_transform_f<thrust::counting_iterator<long, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::device_ptr<long>, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<long>, thrust::cuda_cub::__transform::always_true_predicate>, long>, thrust::cuda_cub::__transform::unary_transform_f<thrust::counting_iterator<long, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::device_ptr<long>, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<long>, thrust::cuda_cub::__transform::always_true_predicate>, long>(thrust::use_default, thrust::use_default)
	0.04% 183.04us 90 2.0330us 1.5040us 4.6720us _ZN2at6native18elementwise_kernelILi512ELi1EZNS0_15gpu_kernel_implIZZZNS0_19addcdiv_cuda_kernelERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlfffE_EEvS4_RKT_EUliE_EEviT1_
	0.04% 166.43us 158 1.0530us 768ns 3.7120us [CUDA memset]
	0.04% 156.42us 81 1.9310us 1.2160us 3.5200us _ZN2at6native18elementwise_kernelILi512ELi1EZNS0_15gpu_kernel_implIZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_EEvS4_RKT_EUliE_EEviT1_
	0.03% 138.37us 90 1.5370us 1.1840us 15.008us void kernelPointwiseApply2<Tensor_sqrt_Float_Op, float, float, unsigned int, int=1, int=1>(OffsetInfo<float, unsigned int, int=1>, OffsetInfo<float, unsigned int, int=1>, unsigned int, Tensor_sqrt_Float_Op)
	0.03% 136.13us 60 2.2680us 1.3760us 3.9680us cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams)
	0.03% 131.33us 90 1.4590us 1.2160us 2.6560us _ZN2at6native18elementwise_kernelILi512ELi1EZNS0_15gpu_kernel_implIZZZNS0_19addcmul_cuda_kernelERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlfffE_EEvS4_RKT_EUliE_EEviT1_
	0.03% 129.47us 90 1.4380us 1.0880us 2.8800us _ZN2at6native18elementwise_kernelILi512ELi1EZNS0_15gpu_kernel_implIZNS0_23gpu_kernel_with_scalarsIZZZNS0_15mul_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvEUlffE_EEvS5_RKT_EUlfE0_EEvS5_SB_EUliE_EEviT1_
	0.03% 112.54us 90 1.2500us 1.0560us 2.4960us _ZN2at6native18elementwise_kernelILi512ELi1EZNS0_15gpu_kernel_implIZNS0_23gpu_kernel_with_scalarsIZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_EEvS5_RKT_EUlfE0_EEvS5_SD_EUliE_EEviT1_
	0.03% 108.00us 10 10.799us 10.624us 10.976us at::native::_GLOBAL__N__70_tmpxft_00007ce6_00000000_11_EmbeddingBackwardKernel_compute_75_cpp1_ii_4a9f87d3::krn_partial_segment_offset(long*, long const *, long const *, long const *, long)
	0.02% 107.55us 10 10.755us 10.496us 11.392us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__scan::ScanAgent<thrust::device_ptr<long>, thrust::device_ptr<long>, thrust::plus<long>, int, long, thrust::detail::integral_constant<bool, bool=0>>, thrust::device_ptr<long>, thrust::device_ptr<long>, thrust::plus<long>, int, thrust::cuda_cub::cub::ScanTileState<long, bool=1>, thrust::cuda_cub::__scan::AddInitToExclusiveScan<long, thrust::plus<long>>>(thrust::device_ptr<long>, thrust::device_ptr<long>, long, thrust::plus<long>, int, long)
	0.02% 88.159us 10 8.8150us 8.6400us 9.0240us _ZN2at6native13reduce_kernelILi512ENS0_8ReduceOpIfNS0_14func_wrapper_tIfZNS0_15sum_kernel_implIfffEEvRNS_14TensorIteratorEEUlffE_EEjfLi4EEEEEvT0_
	0.02% 86.304us 10 8.6300us 8.1920us 9.0240us void at::native::reduce_kernel<int=512, at::native::ReduceOp<float, at::native::MeanOps<float, float>, unsigned int, float, int=4>>(float)
	0.02% 83.262us 60 1.3870us 1.1190us 2.6560us cudnn::maxwell::gemm::computeBOffsetsKernel(cudnn::maxwell::gemm::ComputeBOffsetsParams)
	0.01% 62.240us 30 2.0740us 1.3760us 3.0720us cudnn::maxwell::gemm::computeWgradOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams)
	0.01% 52.639us 12 4.3860us 864ns 35.520us [CUDA memcpy HtoD]
	0.01% 50.592us 30 1.6860us 1.4080us 1.9200us void scalePackedTensor_kernel<float, float>(cudnnTensor4dStruct, float*, float)
	0.01% 41.088us 20 2.0540us 1.2480us 2.9440us _ZN2at6native18elementwise_kernelILi128ELi4EZNS0_15gpu_kernel_implIZNS0_23gpu_kernel_with_scalarsIZZZNS0_15mul_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvEUlffE_EEvS5_RKT_EUlfE0_EEvS5_SB_EUliE0_EEviT1_
	0.01% 33.984us 30 1.1320us 960ns 2.4640us [CUDA memcpy DtoH]
	0.01% 27.776us 10 2.7770us 2.6560us 2.9120us _ZN2at6native18elementwise_kernelILi128ELi4EZNS0_15gpu_kernel_implIZNS0_23gpu_kernel_with_scalarsIZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_EEvS5_RKT_EUlfE_EEvS5_SD_EUliE0_EEviT1_
	0.01% 27.552us 10 2.7550us 2.5920us 3.1360us _ZN2at6native18elementwise_kernelILi128ELi4EZNS0_15gpu_kernel_implIZZZNS0_15mul_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvEUlffE_EEvS4_RKT_EUliE0_EEviT1_
	0.01% 26.720us 10 2.6720us 2.4960us 2.9760us _ZN2at6native18elementwise_kernelILi128ELi4EZNS0_15gpu_kernel_implIZZZNS0_15neg_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE2_clEvEUlfE_EEvS4_RKT_EUliE0_EEviT1_
	0.01% 26.016us 10 2.6010us 2.4960us 2.7520us at::native::_GLOBAL__N__70_tmpxft_00007ce6_00000000_11_EmbeddingBackwardKernel_compute_75_cpp1_ii_4a9f87d3::krn_partials_per_segment(long*, long const *, long, long)
	0.01% 24.704us 10 2.4700us 2.3680us 2.5600us _ZN2at6native18elementwise_kernelILi128ELi4EZNS0_15gpu_kernel_implIZZZNS0_16fill_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlvE_EEvS4_RKT_EUliE0_EEviT1_
	0.00% 18.816us 10 1.8810us 1.6000us 2.4000us void kernelPointwiseApply2<TensorPowOp<float, int=2>, float, float, unsigned int, int=1, int=1>(OffsetInfo<int=2, float, float>, OffsetInfo<TensorPowOp<float, int=2>, float, unsigned int>, float, float)
	0.00% 17.344us 10 1.7340us 1.6640us 1.9200us _ZN2at6native18elementwise_kernelILi512ELi1EZNS0_15gpu_kernel_implIZZZNS0_15div_kernel_cudaERNS_14TensorIteratorEENKUlvE_clEvENKUlvE0_clEvEUlfE_EEvS4_RKT_EUliE_EEviT1_
	0.00% 16.576us 10 1.6570us 1.5680us 1.7600us void kernelPointwiseApply2<TensorPowOp<float, int=1>, float, float, unsigned int, int=1, int=1>(OffsetInfo<int=1, float, float>, OffsetInfo<TensorPowOp<float, int=1>, float, unsigned int>, float, float)
	0.00% 14.624us 10 1.4620us 1.4080us 1.5680us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__scan::InitAgent<thrust::cuda_cub::cub::ScanTileState<long, bool=1>, int>, thrust::cuda_cub::cub::ScanTileState<long, bool=1>, int>(bool=1, thrust::cuda_cub::cub::ScanTileState<long, bool=1>)
	0.00% 12.384us 10 1.2380us 1.2160us 1.3120us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__unique_by_key::InitAgent<thrust::cuda_cub::cub::ScanTileState<int, bool=1>, int*, int>, thrust::cuda_cub::cub::ScanTileState<int, bool=1>, unsigned long, int*>(bool=1, thrust::cuda_cub::cub::ScanTileState<int, bool=1>, int*)
	API calls: 46.39% 3.85477s 26 148.26ms 4.9060us 3.85164s cudaMalloc
	44.98% 3.73799s 20 186.90ms 6.2020us 3.73784s cudaDeviceSynchronize
	4.69% 390.10ms 69 5.6537ms 1.6030us 39.560ms cudaStreamSynchronize
	3.55% 295.34ms 5 59.068ms 921ns 295.33ms cudaFree
	0.19% 16.143ms 1431 11.281us 5.7860us 127.83us cudaLaunchKernel
	0.03% 2.3127ms 6549 353ns 227ns 15.723us cudaGetDevice
	0.02% 1.8191ms 4 454.78us 397.03us 498.10us cudaGetDeviceProperties
	0.02% 1.7329ms 378 4.5840us 120ns 226.29us cuDeviceGetAttribute
	0.02% 1.5570ms 3280 474ns 250ns 18.329us cudaSetDevice
	0.02% 1.4655ms 158 9.2750us 2.6380us 47.200us cudaMemsetAsync
	0.02% 1.3464ms 3 448.81us 22.396us 1.2946ms cudaMemcpy
	0.01% 1.1193ms 59 18.970us 7.4570us 160.37us cudaMemcpyAsync
	0.01% 1.1062ms 2 553.08us 110.49us 995.66us cudaHostAlloc
	0.01% 891.47us 4 222.87us 207.85us 246.84us cuDeviceTotalMem
	0.01% 721.68us 16 45.105us 15.948us 379.98us cudaStreamCreateWithFlags
	0.00% 402.34us 8 50.292us 11.015us 279.55us cudaStreamCreateWithPriority
	0.00% 289.82us 100 2.8980us 1.9730us 13.529us cudaFuncGetAttributes
	0.00% 277.51us 1551 178ns 91ns 1.1910us cudaGetLastError
	0.00% 193.34us 120 1.6110us 1.0500us 4.0170us cudaEventRecord
	0.00% 156.79us 4 39.197us 32.734us 54.488us cuDeviceGetName
	0.00% 133.90us 329 406ns 218ns 18.548us cudaDeviceGetAttribute
	0.00% 69.315us 72 962ns 374ns 17.370us cudaEventCreateWithFlags
	0.00% 59.417us 520 114ns 92ns 284ns cudaPeekAtLastError
	0.00% 42.002us 30 1.4000us 798ns 12.811us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
	0.00% 7.9490us 29 274ns 99ns 1.6330us cudaGetDeviceCount
	0.00% 5.2270us 2 2.6130us 1.9590us 3.2680us cuDeviceGetPCIBusId
	0.00% 4.0520us 2 2.0260us 1.6980us 2.3540us cudaHostGetDevicePointer
	0.00% 2.6600us 2 1.3300us 1.2800us 1.3800us cudaDeviceGetStreamPriorityRange
	0.00% 2.4270us 6 404ns 154ns 1.0610us cuDeviceGet
	0.00% 1.7530us 1 1.7530us 1.7530us 1.7530us cuInit
	0.00% 1.5940us 4 398ns 207ns 679ns cuDeviceGetCount
	0.00% 864ns 4 216ns 169ns 269ns cuDeviceGetUuid
	0.00% 789ns 1 789ns 789ns 789ns cuDriverGetVersion
No results found