Skip to content

Instantly share code, notes, and snippets.

@kevinrobinson
Last active December 16, 2015 18:51
Show Gist options
  • Save kevinrobinson/3aa4f105b03107d2f991 to your computer and use it in GitHub Desktop.
// GPU launcher for the 2-D convolution op, specialized for GPUDevice.
// NOTE(review): this is an excerpt — the "// ..." markers elide code that
// defines several locals used below (m/n/k and the *_ptr device buffers,
// out_rows/out_cols/in_rows/in_cols/patch_rows/patch_cols, and
// transformed_input). Comments on those sections are inferred from the
// surrounding code and should be confirmed against the full source.
template <typename T>
struct LaunchConvOp<GPUDevice, T> {
// Runs the convolution of `input_param` (NHWC) with `filter` (HWIO) into
// `output`, using `stride` for both spatial dimensions and `padding`
// (SAME or VALID). Errors are reported via ctx->SetStatus / OP_REQUIRES
// rather than a return value.
static void launch(OpKernelContext* ctx, bool use_cudnn, const Tensor& input_param, const Tensor& filter, int stride, const Eigen::PaddingType& padding, Tensor* output) {
// All work below is enqueued onto this device's StreamExecutor stream.
auto* stream = ctx->op_device_context<GPUDeviceContext>()->stream();
OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));
// There's branching here for three separate paths:
//   1. 1x1 filter  -> the convolution degenerates to a matrix multiply,
//      so call cuBLAS GEMM directly (cheaper than full cuDNN setup).
//   2. General filter -> cuDNN convolution, padding the input first when
//      the SAME scheme requires it.
//   3. use_cudnn == false -> fall back to the Eigen implementation on the
//      GPU (elided at the bottom).
if (use_cudnn) {
// Local copy of the tensor handle (not the data) so the SAME-padding
// branch can swap in the padded tensor without touching the caller's.
Tensor input = input_param;
if (filter.dim_size(0) == 1 && filter.dim_size(1) == 1) {
// 1x1 filter, so call cublas directly: each output pixel is just a
// matrix product of the input feature vector with the filter matrix.
// ... (elided: m, n, k and the a/b/c device pointers are set up here)
bool blas_launch_status = stream->ThenBlasGemm(no_transpose, no_transpose, n, m, k, 1.0f, b_ptr, n, a_ptr, k, 0.0f, &c_ptr, n).ok();
if (!blas_launch_status) {
ctx->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m, ", n=", n, ", k=", k));
}
// GEMM path handles the whole op; skip the cuDNN setup entirely.
return;
}
if (padding == Eigen::PADDING_SAME) {
// cuDNN (as used here) expects an explicitly padded input, so we
// materialize the SAME padding into a new tensor first.
// Total padding on rows and cols is
// Pr = (R' - 1) * S + Kr - R
// Pc = (C' - 1) * S + Kc - C
// where (R', C') are output dimensions, (R, C) are input dimensions, S
// is stride, (Kr, Kc) are filter dimensions.
// We pad Pr/2 on the left and Pr - Pr/2 on the right, Pc/2 on the top
// and Pc - Pc/2 on the bottom. When Pr or Pc is odd, this means
// we pad more on the right and bottom than on the top and left.
const int padding_rows = (out_rows - 1) * stride + patch_rows - in_rows;
const int padding_cols = (out_cols - 1) * stride + patch_cols - in_cols;
// ... allocate memory (elided: transformed_input is allocated here)
// Copy input_param into the padded buffer; To32Bit casts the index
// type to int32 for the GPU kernel.
functor::PadInput<GPUDevice, T, int>()(
ctx->eigen_device<GPUDevice>(), To32Bit(input_param.tensor<T, 4>()),
padding_rows / 2, padding_rows - padding_rows / 2, padding_cols / 2,
padding_cols - padding_cols / 2,
To32Bit(transformed_input.tensor<T, 4>()));
// From here on the cuDNN path operates on the padded tensor.
input = transformed_input;
}
// We're going to use a CUDA-optimized convolution method that can perform
// a larger batch of work at once.
// Describe the input and output parameters. Dimensions follow the NHWC
// tensor layout: dim 0 = batch, 1 = height, 2 = width, 3 = channels.
perftools::gputools::dnn::BatchDescriptor input_desc;
input_desc.set_count(input.dim_size(0))
.set_height(input.dim_size(1))
.set_width(input.dim_size(2))
.set_feature_map_count(input.dim_size(3))
.set_layout(perftools::gputools::dnn::DataLayout::kBatchYXDepth);
perftools::gputools::dnn::BatchDescriptor output_desc;
output_desc.set_count(output->dim_size(0))
.set_height(output->dim_size(1))
.set_width(output->dim_size(2))
.set_feature_map_count(output->dim_size(3))
.set_layout(perftools::gputools::dnn::DataLayout::kBatchYXDepth);
// Describe the convolution filter (HWIO layout: height, width,
// input channels, output channels) and the stride configuration.
perftools::gputools::dnn::FilterDescriptor filter_desc;
filter_desc.set_input_filter_height(filter.dim_size(0))
.set_input_filter_width(filter.dim_size(1))
.set_input_feature_map_count(filter.dim_size(2))
.set_output_feature_map_count(filter.dim_size(3));
perftools::gputools::dnn::ConvolutionDescriptor conv_desc;
conv_desc.set_vertical_filter_stride(stride)
.set_horizontal_filter_stride(stride);
// ... allocate memory and transform some shapes ...
// (elided: input_ptr, filter_ptr, output_ptr device memory is set up here)
// Launch the convolution operation on the CUDA GPU.
// This will delegate in turn to the DnnSupport plugin (cuDNN).
bool cudnn_launch_status = stream->ThenConvolve(input_desc, input_ptr, filter_desc, filter_ptr, conv_desc, output_desc, &output_ptr).ok();
if (!cudnn_launch_status) {
ctx->SetStatus(errors::Internal("cuDNN launch failure : input shape(", input.shape().DebugString(),") filter shape(", filter.shape().DebugString(), ")"));
}
} else {
// ... fallback to Eigen (elided): same convolution computed with Eigen
// kernels on the GPU when cuDNN is not available or not requested ...
}
}
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment