// If linked against cuDNN dynamically, i.e. built with the following command:
//   g++ cudnn-xmma.cpp -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -lcudnn -lcudart
// then nvprof shows the following kernels executed on an RTX 2080:
//             Type  Time(%)      Time     Calls       Avg       Min       Max  Name
//  GPU activities:   64.57%  20.416us         1  20.416us  20.416us  20.416us  void xmma_cudnn::gemm::kernel<xmma_cudnn::implicit_gemm::fprop::Kernel_traits<xmma_cudnn::Turing_hmma_fp32_traits, xmma_cudnn::Cta_tile<xmma_cudnn::Turing, int=64, int=32, int=64, int=2, int=1, int=2, int=1, int=1>, xmma_cudnn::implicit_gemm::fprop::Gmem_tile_a_t<xmma_cudnn::Turing_hmma_fp32_traits, xmma_cudnn::Cta_tile<xmma_cudnn::Turing, int=64, int=32, int=64, int=2, int=1, int=2, int=1, int=1>, xmma_cudnn::implicit_gemm::Input_related<int=0, int=0, int=0, bool=0>, int=16, bool=0, xmma_cudnn::implicit_gemm::fprop::Gmem_tile_base_a<xmma_cudnn::Turing_hmma_fp32_traits, xmma_cudnn::Cta_tile<xmma_cudnn::Turing, int=64, int=32, int=64, int=2, int=1, int=2, int=1, int=1>, xmma_cudnn::implicit_gemm::Input_related<int=0, int=0, int=0, bool=0>, int=16, xmma_cudnn::Row, int=64, int=64>>, xmma_cudnn::implicit_gemm::fprop::Gmem_tile_c_t<xmma_cudnn::Turing_hmma_fp32_traits, xmma_cudnn::Cta_tile<xmma_cudnn::Turing, int=64, int=32, int=64, int=2, int=1, int=2, int=1, int=1>, int=16, xmma_cudnn::Fragment_c<xmma_cudnn::Turing_hmma_fp32_traits, xmma_cudnn::Cta_tile<xmma_cudnn::Turing, int=64, int=32, int=64, int=2, int=1, int=2, int=1, int=1>, bool=1>, bool=0>, xmma_cudnn::implicit_gemm::Input_related<int=0, int=0, int=0, bool=0>, int=1>>(xmma_cudnn::Turing_hmma_fp32_traitsParams)
//                    32.19%  10.176us         4  2.5440us  2.4000us  2.9760us  [CUDA memset]
//                     3.24%  1.0240us         1  1.0240us  1.0240us  1.0240us  [CUDA memcpy HtoD]
// But if linked statically, i.e. built with:
//   g++ cudnn-xmma.cpp -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -lcudnn_static -lcublas -lculibos -lcudart -lpthread -ldl
// then nvprof outputs the following on the same GPU:
//
//             Type  Time(%)      Time     Calls       Avg       Min       Max  Name
//  GPU activities:   77.23%  55.584us         1  55.584us  55.584us  55.584us  volta_fp16_s884cudnn_fp16_256x64_ldg8_splitK_relu_f2f_exp_small_nhwc_tn_v1
//                    19.08%  13.728us         6  2.2880us  1.1840us  2.7200us  [CUDA memset]
//                     3.69%  2.6560us         1  2.6560us  2.6560us  2.6560us  void cask_cudnn::computeOffsetsKernel<bool=0, bool=1>(cask_cudnn::ComputeOffsetsParams)
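// To reproduce either trace, profile the resulting binary with nvprof
// (a sketch; assumes the default output name a.out):
//   nvprof ./a.out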
#include <cudnn.h>
#include <cuda_runtime_api.h>
#include <iostream>
#include <cstdlib>

#define checkCUDNN(expression)                               \
  {                                                          \
    cudnnStatus_t status = (expression);                     \
    if (status != CUDNN_STATUS_SUCCESS) {                    \
      std::cerr << "Error on line " << __LINE__ << ": "      \
                << cudnnGetErrorString(status) << std::endl; \
      std::exit(EXIT_FAILURE);                               \
    }                                                        \
  }
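// An analogous checker for CUDA runtime calls (not in the original gist; added
// so that failed allocations are reported instead of silently ignored):
#define checkCUDA(expression)                                \
  {                                                          \
    cudaError_t err = (expression);                          \
    if (err != cudaSuccess) {                                \
      std::cerr << "Error on line " << __LINE__ << ": "      \
                << cudaGetErrorString(err) << std::endl;     \
      std::exit(EXIT_FAILURE);                               \
    }                                                        \
  }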
int main(int argc, char const *argv[]) {
  const float alpha = 1, beta = 0;
  float* d_kernel{nullptr};
  float* d_input{nullptr};
  float* d_output{nullptr};
  void* d_workspace{nullptr};
  size_t workspace_bytes = 196992;  // fallback size; queried from cuDNN below
  constexpr int batch_size = 1;
  constexpr int in_channels = 48;
  constexpr int out_channels = 64;
  constexpr int kernel_width = 5;
  constexpr int kernel_height = 5;
  constexpr int in_height = 38;
  constexpr int in_width = 38;
  constexpr int out_height = 38;
  constexpr int out_width = 38;
  constexpr auto data_type = CUDNN_DATA_HALF;  // fp16: 2 bytes per element
  constexpr auto format = CUDNN_TENSOR_NHWC;
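  // Sanity check (not in the original): with a 5x5 kernel, padding 2,
  // stride 1, and dilation 1, the spatial output size equals the input size:
  //   out = (in + 2*pad - kernel)/stride + 1 = (38 + 4 - 5)/1 + 1 = 38
  static_assert((in_width + 2*2 - kernel_width)/1 + 1 == out_width,
                "output width must match the convolution arithmetic");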
  // Each buffer holds CUDNN_DATA_HALF elements, hence the factor of 2 bytes.
  checkCUDA(cudaMalloc(&d_input, batch_size*in_channels*in_width*in_height*2));
  checkCUDA(cudaMalloc(&d_output, batch_size*out_channels*out_width*out_height*2));
  checkCUDA(cudaMalloc(&d_kernel, out_channels*in_channels*kernel_width*kernel_height*2));
  cudnnHandle_t cudnn;
  cudnnTensorDescriptor_t input_descriptor;
  cudnnTensorDescriptor_t output_descriptor;
  cudnnFilterDescriptor_t kernel_descriptor;
  cudnnConvolutionDescriptor_t convolution_descriptor;
  cudnnConvolutionFwdAlgo_t convolution_algorithm = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
  cudnnConvolutionBwdFilterAlgo_t backward_filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;  // declared but unused below
  checkCUDNN(cudnnCreate(&cudnn));
  checkCUDNN(cudnnCreateTensorDescriptor(&input_descriptor));
  checkCUDNN(cudnnSetTensor4dDescriptor(input_descriptor,
                                        format,
                                        data_type,
                                        batch_size,
                                        in_channels,
                                        in_height,
                                        in_width));
  checkCUDNN(cudnnCreateTensorDescriptor(&output_descriptor));
  checkCUDNN(cudnnSetTensor4dDescriptor(output_descriptor,
                                        format,
                                        data_type,
                                        batch_size,
                                        out_channels,
                                        out_height,
                                        out_width));
  checkCUDNN(cudnnCreateFilterDescriptor(&kernel_descriptor));
  checkCUDNN(cudnnSetFilter4dDescriptor(kernel_descriptor,
                                        data_type,
                                        format,
                                        out_channels,
                                        in_channels,
                                        kernel_height,
                                        kernel_width));
  checkCUDNN(cudnnCreateConvolutionDescriptor(&convolution_descriptor));
  checkCUDNN(cudnnSetConvolution2dDescriptor(convolution_descriptor,
                                             /*pad_height=*/2,
                                             /*pad_width=*/2,
                                             /*vertical_stride=*/1,
                                             /*horizontal_stride=*/1,
                                             /*dilation_height=*/1,
                                             /*dilation_width=*/1,
                                             /*mode=*/CUDNN_CROSS_CORRELATION,
                                             /*computeType=*/CUDNN_DATA_FLOAT));
  checkCUDNN(cudnnSetConvolutionMathType(convolution_descriptor, CUDNN_TENSOR_OP_MATH));
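  // Rather than trusting the hardcoded fallback above, query cuDNN for the
  // workspace the chosen algorithm actually needs (not in the original gist):
  checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnn,
                                                     input_descriptor,
                                                     kernel_descriptor,
                                                     convolution_descriptor,
                                                     output_descriptor,
                                                     convolution_algorithm,
                                                     &workspace_bytes));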
  checkCUDA(cudaMalloc(&d_workspace, workspace_bytes));
  checkCUDNN(cudnnConvolutionForward(cudnn,
                                     &alpha,
                                     input_descriptor,
                                     d_input,
                                     kernel_descriptor,
                                     d_kernel,
                                     convolution_descriptor,
                                     convolution_algorithm,
                                     d_workspace,
                                     workspace_bytes,
                                     &beta,
                                     output_descriptor,
                                     d_output));
  checkCUDA(cudaDeviceSynchronize());
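  // Tidy up (not in the original gist; a minimal cleanup sketch releasing
  // device memory and cuDNN state before exit):
  cudaFree(d_workspace);
  cudaFree(d_kernel);
  cudaFree(d_output);
  cudaFree(d_input);
  cudnnDestroyConvolutionDescriptor(convolution_descriptor);
  cudnnDestroyFilterDescriptor(kernel_descriptor);
  cudnnDestroyTensorDescriptor(output_descriptor);
  cudnnDestroyTensorDescriptor(input_descriptor);
  cudnnDestroy(cudnn);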
}