// If linked against cuDNN dynamically, i.e. using the following command:
// g++ cudnn-xmma.cpp -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -lcudnn -lcudart
// Then nvprof shows the following kernels executed on an RTX 2080:
// Type Time(%) Time Calls Avg Min Max Name
// GPU activities: 64.57% 20.416us 1 20.416us 20.416us 20.416us void xmma_cudnn::gemm::kernel<xmma_cudnn::implicit_gemm::fprop::Kernel_traits<xmma_cudnn::Turing_hmma_fp32_traits, xmma_cudnn::Cta_tile<xmma_cudnn::Turing, int=64, int=32, int=64, int=2, int=1, int=2, int=1, int=1>, xmma_cudnn::implicit_gemm::fprop::Gmem_tile_a_t<xmma_cudnn::Turing_hmma_fp32_traits, xmma_cudnn::Cta_tile<xmma_cudnn::Turing, int=64, int=32, int=64, int=2, int=1, int=2, int=1, int=1>, xmma_cudnn::implicit_gemm::Input_related<int=0, int=0, int=0, bool=0>, int=16, bool=0, xmma_cudnn::implicit_gemm::fprop::Gmem_tile_base_a<xmma_cudnn::Turing_hmma_fp32_traits, xmma_cudnn::Cta_tile<xmma_cudnn::Turing, int=64, int=32, int=64, int=2, int=1, int=2, int=1, int=1>, xmma_cudnn::implicit_gemm::Input_related<int=0, int=0, int=0, bool=0>, int=16, xmma_cudnn::Row, int=64, int=64>>, xmma_cudnn::implicit_gemm::fprop::Gmem_tile_c_t<xmma_cudnn::Turing_hmma_fp32_traits, xmma_cudnn::Cta_tile<xmma_cudnn::Turing, int=64, int=32, int=64, int=2, int=1, int=2, int=1, int=1>, int=16, xmma_cudnn::Fragment_c<xmma_cudnn::Turing_hmma_fp32_traits, xmma_cudnn::Cta_tile<xmma_cudnn::Turing, int=64, int=32, int=64, int=2, int=1, int=2, int=1, int=1>, bool=1>, bool=0>, xmma_cudnn::implicit_gemm::Input_related<int=0, int=0, int=0, bool=0>, int=1>>(xmma_cudnn::Turing_hmma_fp32_traitsParams)
// 32.19% 10.176us 4 2.5440us 2.4000us 2.9760us [CUDA memset]
// 3.24% 1.0240us 1 1.0240us 1.0240us 1.0240us [CUDA memcpy HtoD]
// But if linked statically, i.e. using:
// g++ cudnn-xmma.cpp -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -lcudnn_static -lcublas -lculibos -lcudart -lpthread -ldl
// Then nvprof outputs the following on the same GPU:
//
// Type Time(%) Time Calls Avg Min Max Name
// GPU activities: 77.23% 55.584us 1 55.584us 55.584us 55.584us volta_fp16_s884cudnn_fp16_256x64_ldg8_splitK_relu_f2f_exp_small_nhwc_tn_v1
// 19.08% 13.728us 6 2.2880us 1.1840us 2.7200us [CUDA memset]
// 3.69% 2.6560us 1 2.6560us 2.6560us 2.6560us void cask_cudnn::computeOffsetsKernel<bool=0, bool=1>(cask_cudnn::ComputeOffsetsParams)
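// (Note, not part of the original gist: the profiles above can be reproduced by running
// the resulting binary under nvprof, e.g. `nvprof ./a.out`; the include and library
// paths in the commands above assume a default CUDA install under /usr/local/cuda.)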
#include <cudnn.h>
#include <cuda_runtime_api.h>
#include <iostream>
#include <cstdlib>
#define checkCUDNN(expression)                                 \
  {                                                            \
    cudnnStatus_t status = (expression);                       \
    if (status != CUDNN_STATUS_SUCCESS) {                      \
      std::cerr << "Error on line " << __LINE__ << ": "        \
                << cudnnGetErrorString(status) << std::endl;   \
      std::exit(EXIT_FAILURE);                                 \
    }                                                          \
  }
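// A hedged addition, not in the original gist: an analogous macro for CUDA runtime
// calls, in case one also wants to check the cudaMalloc/cudaDeviceSynchronize calls below.
#define checkCUDA(expression)                                  \
  {                                                            \
    cudaError_t err = (expression);                            \
    if (err != cudaSuccess) {                                  \
      std::cerr << "Error on line " << __LINE__ << ": "        \
                << cudaGetErrorString(err) << std::endl;       \
      std::exit(EXIT_FAILURE);                                 \
    }                                                          \
  }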
int main(int argc, char const *argv[]) {
  const float alpha = 1, beta = 0;
  float* d_kernel{nullptr};
  float* d_input{nullptr};
  float* d_output{nullptr};
  void* d_workspace{nullptr};
  // Workspace size hardcoded for this particular configuration and algorithm.
  size_t workspace_bytes = 196992;
  constexpr int batch_size = 1;
  constexpr int in_channels = 48;
  constexpr int out_channels = 64;
  constexpr int kernel_width = 5;
  constexpr int kernel_height = 5;
  constexpr int in_height = 38;
  constexpr int in_width = 38;
  constexpr int out_height = 38;
  constexpr int out_width = 38;
  constexpr auto data_type = CUDNN_DATA_HALF;
  constexpr auto format = CUDNN_TENSOR_NHWC;
  // The buffers hold fp16 data, hence the factor of 2 bytes per element.
  cudaMalloc(&d_input, batch_size*in_channels*in_width*in_height*2);
  cudaMalloc(&d_output, batch_size*out_channels*out_width*out_height*2);
  cudaMalloc(&d_kernel, out_channels*in_channels*kernel_width*kernel_height*2);
  cudnnHandle_t cudnn;
  cudnnTensorDescriptor_t input_descriptor;
  cudnnTensorDescriptor_t output_descriptor;
  cudnnFilterDescriptor_t kernel_descriptor;
  cudnnConvolutionDescriptor_t convolution_descriptor;
  cudnnConvolutionFwdAlgo_t convolution_algorithm = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
  cudnnConvolutionBwdFilterAlgo_t backward_filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;  // declared but unused
  checkCUDNN(cudnnCreate(&cudnn));
  checkCUDNN(cudnnCreateTensorDescriptor(&input_descriptor));
  checkCUDNN(cudnnSetTensor4dDescriptor(input_descriptor,
                                        format,
                                        data_type,
                                        batch_size,
                                        in_channels,
                                        in_height,
                                        in_width));
  checkCUDNN(cudnnCreateTensorDescriptor(&output_descriptor));
  checkCUDNN(cudnnSetTensor4dDescriptor(output_descriptor,
                                        format,
                                        data_type,
                                        batch_size,
                                        out_channels,
                                        out_height,
                                        out_width));
  checkCUDNN(cudnnCreateFilterDescriptor(&kernel_descriptor));
  checkCUDNN(cudnnSetFilter4dDescriptor(kernel_descriptor,
                                        data_type,
                                        format,
                                        out_channels,
                                        in_channels,
                                        kernel_height,
                                        kernel_width));
  checkCUDNN(cudnnCreateConvolutionDescriptor(&convolution_descriptor));
  checkCUDNN(cudnnSetConvolution2dDescriptor(convolution_descriptor,
                                             /*pad_height=*/2,
                                             /*pad_width=*/2,
                                             /*vertical_stride=*/1,
                                             /*horizontal_stride=*/1,
                                             /*dilation_height=*/1,
                                             /*dilation_width=*/1,
                                             /*mode=*/CUDNN_CROSS_CORRELATION,
                                             /*computeType=*/CUDNN_DATA_FLOAT));
  checkCUDNN(cudnnSetConvolutionMathType(convolution_descriptor, CUDNN_TENSOR_OP_MATH));
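  // A hedged sketch, not in the original gist: instead of hardcoding workspace_bytes
  // above, the required size for this algorithm could be queried at this point, e.g.:
  //   checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnn,
  //                                                      input_descriptor,
  //                                                      kernel_descriptor,
  //                                                      convolution_descriptor,
  //                                                      output_descriptor,
  //                                                      convolution_algorithm,
  //                                                      &workspace_bytes));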
  cudaMalloc(&d_workspace, workspace_bytes);
  checkCUDNN(cudnnConvolutionForward(cudnn,
                                     &alpha,
                                     input_descriptor,
                                     d_input,
                                     kernel_descriptor,
                                     d_kernel,
                                     convolution_descriptor,
                                     convolution_algorithm,
                                     d_workspace,
                                     workspace_bytes,
                                     &beta,
                                     output_descriptor,
                                     d_output));
  cudaDeviceSynchronize();
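  // Cleanup (a hedged addition, not in the original gist; without it the handle,
  // descriptors, and device buffers are simply leaked at process exit):
  cudaFree(d_workspace);
  cudaFree(d_kernel);
  cudaFree(d_output);
  cudaFree(d_input);
  cudnnDestroyConvolutionDescriptor(convolution_descriptor);
  cudnnDestroyFilterDescriptor(kernel_descriptor);
  cudnnDestroyTensorDescriptor(output_descriptor);
  cudnnDestroyTensorDescriptor(input_descriptor);
  cudnnDestroy(cudnn);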
}