Skip to content

Instantly share code, notes, and snippets.

@kevinrobinson
Last active December 16, 2015 18:51
Show Gist options
  • Save kevinrobinson/3aa4f105b03107d2f991 to your computer and use it in GitHub Desktop.
// GPU launcher for the 2-D convolution op, specialized for GPUDevice.
// NOTE(review): this is an excerpt — the "// ..." markers elide code that
// defines several locals used below (m/n/k and the *_ptr device buffers,
// out_rows/out_cols/in_rows/in_cols/patch_rows/patch_cols, and
// transformed_input). Comments on those sections are inferred from the
// surrounding code and should be confirmed against the full source.
template <typename T>
struct LaunchConvOp<GPUDevice, T> {
// Runs the convolution of `input_param` (NHWC) with `filter` (HWIO) into
// `output`, using `stride` for both spatial dimensions and `padding`
// (SAME or VALID). Errors are reported via ctx->SetStatus / OP_REQUIRES
// rather than a return value.
static void launch(OpKernelContext* ctx, bool use_cudnn, const Tensor& input_param, const Tensor& filter, int stride, const Eigen::PaddingType& padding, Tensor* output) {
// All work below is enqueued onto this device's StreamExecutor stream.
auto* stream = ctx->op_device_context<GPUDeviceContext>()->stream();
OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));
// There's branching here for three separate paths:
//   1. 1x1 filter  -> the convolution degenerates to a matrix multiply,
//      so call cuBLAS GEMM directly (cheaper than full cuDNN setup).
//   2. General filter -> cuDNN convolution, padding the input first when
//      the SAME scheme requires it.
//   3. use_cudnn == false -> fall back to the Eigen implementation on the
//      GPU (elided at the bottom).
if (use_cudnn) {
// Local copy of the tensor handle (not the data) so the SAME-padding
// branch can swap in the padded tensor without touching the caller's.
Tensor input = input_param;
if (filter.dim_size(0) == 1 && filter.dim_size(1) == 1) {
// 1x1 filter, so call cublas directly: each output pixel is just a
// matrix product of the input feature vector with the filter matrix.
// ... (elided: m, n, k and the a/b/c device pointers are set up here)
bool blas_launch_status = stream->ThenBlasGemm(no_transpose, no_transpose, n, m, k, 1.0f, b_ptr, n, a_ptr, k, 0.0f, &c_ptr, n).ok();
if (!blas_launch_status) {
ctx->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m, ", n=", n, ", k=", k));
}
// GEMM path handles the whole op; skip the cuDNN setup entirely.
return;
}
if (padding == Eigen::PADDING_SAME) {
// cuDNN (as used here) expects an explicitly padded input, so we
// materialize the SAME padding into a new tensor first.
// Total padding on rows and cols is
// Pr = (R' - 1) * S + Kr - R
// Pc = (C' - 1) * S + Kc - C
// where (R', C') are output dimensions, (R, C) are input dimensions, S
// is stride, (Kr, Kc) are filter dimensions.
// We pad Pr/2 on the left and Pr - Pr/2 on the right, Pc/2 on the top
// and Pc - Pc/2 on the bottom. When Pr or Pc is odd, this means
// we pad more on the right and bottom than on the top and left.
const int padding_rows = (out_rows - 1) * stride + patch_rows - in_rows;
const int padding_cols = (out_cols - 1) * stride + patch_cols - in_cols;
// ... allocate memory (elided: transformed_input is allocated here)
// Copy input_param into the padded buffer; To32Bit casts the index
// type to int32 for the GPU kernel.
functor::PadInput<GPUDevice, T, int>()(
ctx->eigen_device<GPUDevice>(), To32Bit(input_param.tensor<T, 4>()),
padding_rows / 2, padding_rows - padding_rows / 2, padding_cols / 2,
padding_cols - padding_cols / 2,
To32Bit(transformed_input.tensor<T, 4>()));
// From here on the cuDNN path operates on the padded tensor.
input = transformed_input;
}
// We're going to use a CUDA-optimized convolution method that can perform
// a larger batch of work at once.
// Describe the input and output parameters. Dimensions follow the NHWC
// tensor layout: dim 0 = batch, 1 = height, 2 = width, 3 = channels.
perftools::gputools::dnn::BatchDescriptor input_desc;
input_desc.set_count(input.dim_size(0))
.set_height(input.dim_size(1))
.set_width(input.dim_size(2))
.set_feature_map_count(input.dim_size(3))
.set_layout(perftools::gputools::dnn::DataLayout::kBatchYXDepth);
perftools::gputools::dnn::BatchDescriptor output_desc;
output_desc.set_count(output->dim_size(0))
.set_height(output->dim_size(1))
.set_width(output->dim_size(2))
.set_feature_map_count(output->dim_size(3))
.set_layout(perftools::gputools::dnn::DataLayout::kBatchYXDepth);
// Describe the convolution filter (HWIO layout: height, width,
// input channels, output channels) and the stride configuration.
perftools::gputools::dnn::FilterDescriptor filter_desc;
filter_desc.set_input_filter_height(filter.dim_size(0))
.set_input_filter_width(filter.dim_size(1))
.set_input_feature_map_count(filter.dim_size(2))
.set_output_feature_map_count(filter.dim_size(3));
perftools::gputools::dnn::ConvolutionDescriptor conv_desc;
conv_desc.set_vertical_filter_stride(stride)
.set_horizontal_filter_stride(stride);
// ... allocate memory and transform some shapes ...
// (elided: input_ptr, filter_ptr, output_ptr device memory is set up here)
// Launch the convolution operation on the CUDA GPU.
// This will delegate in turn to the DnnSupport plugin (cuDNN).
bool cudnn_launch_status = stream->ThenConvolve(input_desc, input_ptr, filter_desc, filter_ptr, conv_desc, output_desc, &output_ptr).ok();
if (!cudnn_launch_status) {
ctx->SetStatus(errors::Internal("cuDNN launch failure : input shape(", input.shape().DebugString(),") filter shape(", filter.shape().DebugString(), ")"));
}
} else {
// ... fallback to Eigen (elided): same convolution computed with Eigen
// kernels on the GPU when cuDNN is not available or not requested ...
}
}
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment