Skip to content

Instantly share code, notes, and snippets.

@kevinrobinson
Created December 16, 2015 19:37
Show Gist options
  • Save kevinrobinson/60b9364f83ffe44a7aaa to your computer and use it in GitHub Desktop.
// Launches a 2-D convolution on the GPU, picking one of three paths:
//   1. cuBLAS GEMM when the filter is 1x1 (a 1x1 convolution is exactly a
//      matrix multiply, so the dedicated BLAS kernel is fastest),
//   2. cuDNN for general filter sizes,
//   3. Eigen-on-GPU as a fallback when cuDNN is not in use.
// On launch failure the error is reported through ctx->SetStatus(); the
// function itself returns void.
//
// NOTE(review): this is an excerpt — several locals (no_transpose, n, m, k,
// a_ptr, b_ptr, c_ptr, input_ptr, filter_ptr, output_ptr) are elided behind
// "..." comments and are defined in the full source file.
struct LaunchConvOp<GPUDevice, T> {
  // ctx:         kernel context used for device access and error reporting.
  // use_cudnn:   whether the CUDA/cuDNN path is available (else Eigen).
  // input_param: input tensor, assumed NHWC given the kBatchYXDepth layout
  //              set below.
  // filter:      HWIO filter tensor (height, width, in-depth, out-depth).
  // stride:      stride applied to both spatial dimensions.
  // padding:     Eigen padding mode (consumed by the elided code paths).
  // output:      pre-allocated output tensor, NHWC.
  static void launch(OpKernelContext* ctx, bool use_cudnn,
                     const Tensor& input_param, const Tensor& filter,
                     int stride, const Eigen::PaddingType& padding,
                     Tensor* output) {
    auto* stream = ctx->op_device_context<GPUDeviceContext>()->stream();
    // First, we check if the CUDA platform is registered, and fall back to
    // using Eigen on the GPU if not.
    if (use_cudnn) {
      Tensor input = input_param;
      if (filter.dim_size(0) == 1 && filter.dim_size(1) == 1) {
        // ... 1x1 filter, so call cublas directly ...
        bool blas_launch_status =
            stream->ThenBlasGemm(no_transpose, no_transpose, n, m, k, 1.0f,
                                 b_ptr, n, a_ptr, k, 0.0f, &c_ptr, n)
                .ok();
        // FIX: the original dropped this status on the floor; a failed GEMM
        // launch would silently return success. Mirror the cuDNN path's
        // error handling.
        if (!blas_launch_status) {
          ctx->SetStatus(errors::Internal(
              "Blas SGEMM launch failed : input shape(",
              input.shape().DebugString(), ") filter shape(",
              filter.shape().DebugString(), ")"));
        }
        return;
      }
      // ...??? add some padding to align memory access
      // We're going to use a CUDA-optimized convolution method that can
      // perform a larger batch of work at once.
      // Describe the input and output parameters.
      perftools::gputools::dnn::BatchDescriptor input_desc;
      input_desc.set_count(input.dim_size(0))
          .set_height(input.dim_size(1))
          .set_width(input.dim_size(2))
          .set_feature_map_count(input.dim_size(3))
          .set_layout(perftools::gputools::dnn::DataLayout::kBatchYXDepth);
      perftools::gputools::dnn::BatchDescriptor output_desc;
      output_desc.set_count(output->dim_size(0))
          .set_height(output->dim_size(1))
          .set_width(output->dim_size(2))
          .set_feature_map_count(output->dim_size(3))
          .set_layout(perftools::gputools::dnn::DataLayout::kBatchYXDepth);
      // Describe the convolution: filter geometry and strides.
      perftools::gputools::dnn::FilterDescriptor filter_desc;
      filter_desc.set_input_filter_height(filter.dim_size(0))
          .set_input_filter_width(filter.dim_size(1))
          .set_input_feature_map_count(filter.dim_size(2))
          .set_output_feature_map_count(filter.dim_size(3));
      perftools::gputools::dnn::ConvolutionDescriptor conv_desc;
      conv_desc.set_vertical_filter_stride(stride)
          .set_horizontal_filter_stride(stride);
      // ... allocate memory and transform some shapes ...
      // Launch the convolution operation on the CUDA GPU.
      // This will delegate in turn to the DnnSupport plugin.
      bool cudnn_launch_status =
          stream->ThenConvolve(input_desc, input_ptr, filter_desc, filter_ptr,
                               conv_desc, output_desc, &output_ptr)
              .ok();
      if (!cudnn_launch_status) {
        ctx->SetStatus(errors::Internal(
            "cuDNN launch failure : input shape(",
            input.shape().DebugString(), ") filter shape(",
            filter.shape().DebugString(), ")"));
      }
    } else {
      // ... fallback to Eigen ...
    }
  }
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment