// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// This example demonstrates how to use CUDNN library calls cudnnConvolutionForward,
// cudnnConvolutionBackwardData, and cudnnConvolutionBackwardFilter with the option
// to enable Tensor Cores on Volta with cudnnSetConvolutionMathType.
//
// 1. Make sure cuda and cudnn are installed in the same directory.
//
// 2. Run make from the directory of the sample specifying the cuda installation path:
// make CUDA_PATH=<cuda installation path>
//
// 3. Use the following arguments to run the sample with different convolution parameters:
// -c2048 -h7 -w7 -k512 -r1 -s1 -pad_h0 -pad_w0 -u1 -v1
// -c512 -h28 -w28 -k128 -r1 -s1 -pad_h0 -pad_w0 -u1 -v1
// -c512 -h28 -w28 -k1024 -r1 -s1 -pad_h0 -pad_w0 -u2 -v2
// -c512 -h28 -w28 -k256 -r1 -s1 -pad_h0 -pad_w0 -u2 -v2
// -c256 -h14 -w14 -k256 -r3 -s3 -pad_h1 -pad_w1 -u1 -v1
// -c256 -h14 -w14 -k1024 -r1 -s1 -pad_h0 -pad_w0 -u1 -v1
// -c1024 -h14 -w14 -k256 -r1 -s1 -pad_h0 -pad_w0 -u1 -v1
// -c1024 -h14 -w14 -k2048 -r1 -s1 -pad_h0 -pad_w0 -u2 -v2
// -c1024 -h14 -w14 -k512 -r1 -s1 -pad_h0 -pad_w0 -u2 -v2
// -c512 -h7 -w7 -k512 -r3 -s3 -pad_h1 -pad_w1 -u1 -v1
// -c512 -h7 -w7 -k2048 -r1 -s1 -pad_h0 -pad_w0 -u1 -v1
// -c2048 -h7 -w7 -k512 -r1 -s1 -pad_h0 -pad_w0 -u1 -v1
//
// 4. Use the following additional arguments to run the layer with a different setup (an example invocation is sketched at the end of this header comment):
// -mathType1 : enable Tensor Cores on Volta.
// -dgrad : run cudnnConvolutionBackwardData() instead of cudnnConvolutionForward().
// -wgrad : run cudnnConvolutionBackwardFilter() instead of cudnnConvolutionForward().
// -n<int> : mini batch size. (use -b with large n)
// -b : benchmark mode. Bypass the CPU correctness check.
// -filterFormat1 : Use tensor format CUDNN_TENSOR_NHWC instead of CUDNN_TENSOR_NCHW.
//
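//
// For illustration only: a typical build-and-run sequence following steps 2-4 above
// might look like the following (the binary name and CUDA path are assumptions, not
// taken from this sample):
//   make CUDA_PATH=/usr/local/cuda
//   ./conv_sample -n64 -c512 -h28 -w28 -k128 -r1 -s1 -pad_h0 -pad_w0 -u1 -v1 -mathType1 -b
// which benchmarks the forward convolution of a 64x512x28x28 input with 128 1x1 filters,
// with Tensor Core math enabled and the CPU correctness check bypassed.
//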
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <cuda_runtime.h>
#include <assert.h>
#include <cudnn.h>
#include "fp16_dev.h"
#include "fp16_emu.h"
#define SWITCH_CHAR '-'
#define THRESHOLD 2.0e-2
#if defined(__linux__)
#include <stddef.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/sysinfo.h>
static double second (void)
{
struct timeval tv;
gettimeofday(&tv, NULL);
return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
}
#else
#error unsupported platform
#endif
template <typename T_ELEM> __inline__ cudnnDataType_t getDataType();
template <> __inline__ cudnnDataType_t getDataType<half1>() { return CUDNN_DATA_HALF; }
template <> __inline__ cudnnDataType_t getDataType<float>() { return CUDNN_DATA_FLOAT; }
//Generate uniform numbers [0,1)
static void initImage(float* image, int imageSize) {
static unsigned seed = 123456789;
for (int index = 0; index < imageSize; index++) {
seed = ( 1103515245 * seed + 12345 ) & 0xffffffff;
image[index] = float(seed)*2.3283064e-10; //2^-32
}
}
static void initImage(half1* image, int imageSize) {
static unsigned seed = 123456789;
for (int index = 0; index < imageSize; index++) {
seed = ( 1103515245 * seed + 12345 ) & 0xffffffff;
image[index] = cpu_float2half_rn(float(seed)*2.3283064e-10); //2^-32
}
}
static int checkCudaError(cudaError_t code, const char* expr, const char* file, int line) {
if (code) {
printf("CUDA error at %s:%d, code=%d (%s) in '%s'", file, line, (int) code, cudaGetErrorString(code), expr);
return 1;
}
return 0;
}
#define checkCudaErr(...) do { int err = checkCudaError(__VA_ARGS__, #__VA_ARGS__, __FILE__, __LINE__); if (err) return err; } while (0)
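// For example, checkCudaErr( cudaDeviceSynchronize() ) expands to
// checkCudaError(cudaDeviceSynchronize(), "cudaDeviceSynchronize()", __FILE__, __LINE__)
// and makes the enclosing function return 1 on failure; checkCudnnErr (defined below)
// does the same for cudnnStatus_t return codes.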
static int checkCudnnError(cudnnStatus_t code, const char* expr, const char* file, int line) {
if (code) {
printf("CUDNN error at %s:%d, code=%d (%s) in '%s'\n", file, line, (int) code, cudnnGetErrorString(code), expr);
return 1;
}
return 0;
}
static void printPerf( double cudaTime, double cudaGflops, double cudaBandwidthGb,
const char *cpuLib, double cpuTime, double cpuGflops, double cpuBandwidthGb)
{
printf( "^^^^ CUDA : elapsed = %g sec, ", cudaTime );
if (cudaGflops > 0) printf( "Gflops = %.3f ", cudaGflops );
if (cudaBandwidthGb > 0) printf( "Bandwidth = %.3f ", cudaBandwidthGb );
printf( "\n");
if (cpuLib) {
printf( "^^^^%s : elapsed = %g sec, ", cpuLib, cpuTime );
if (cpuGflops > 0) printf( "Gflops = %.3f ", cpuGflops );
if (cpuBandwidthGb > 0) printf( "Bandwidth = %.3f, ", cpuBandwidthGb );
printf( "Speedup %.2f\n", cpuTime/cudaTime );
}
}
static void generateStrides(const int* dimA, int* strideA, int nbDims, bool isNchw) {
if (isNchw) {
strideA[nbDims-1] = 1 ;
for(int d = nbDims-2 ; d >= 0 ; d--) {
strideA[d] = strideA[d+1] * dimA[d+1] ;
}
} else {
strideA[1] = 1;
strideA[nbDims-1] = strideA[1]*dimA[1];
for(int d = nbDims-2 ; d >= 2 ; d--) {
strideA[d] = strideA[d+1] * dimA[d+1] ;
}
strideA[0] = strideA[2]*dimA[2];
}
}
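// Worked example, assuming the default dimA = {1, 8, 32, 32} (N,C,H,W) set in main():
//   NCHW (isNchw = true) : strideA = {8192, 1024, 32, 1}  (w fastest, then h, c, n)
//   NHWC (isNchw = false): strideA = {8192, 1, 256, 8}    (c fastest, then w, h, n)
// The stride array stays indexed in N,C,H,W order; only the implied memory layout changes.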
#define checkCudnnErr(...) do { int err = checkCudnnError(__VA_ARGS__, #__VA_ARGS__, __FILE__, __LINE__); if (err) return err; } while (0)
// Convert a linear index
// i = d_1 (s_2 ... s_n) + d_2 (s_3 ... s_n) + ... + d_{n-1} s_n + d_n
// (where s_j are the dimension sizes) into a multidimensional index
// (d_1, d_2, ..., d_n)
void lin2dim(int id, int* ids, const int* dims, int length) {
int idrem = id ;
int prod = 1 ; // accumulates the product of the dimensions
for(int i = length-1; i >= 0; i--) {
ids[i] = (idrem / prod) % dims[i] ;
idrem = id - ids[i] * prod ;
prod *= dims[i] ;
}
}
// Convert a multidimensional index
// (d_1, d_2, ..., d_n)
// into a linear index using the given strides:
// i = d_1 stride_1 + ... + d_n stride_n
static int dim2lin(const int* ids, const int* strides, int length) {
int res = 0 ;
for(int i = 0 ; i < length ; i++) {
res += ids[i] * strides[i];
}
return res ;
}
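// Example: with dims = {3, 4}, lin2dim maps the linear index 7 to ids = {1, 3}
// (since 7 = 1*4 + 3), and dim2lin({1, 3}, strides = {4, 1}, 2) maps it back to 7.
// The CPU reference kernels below use these helpers to walk the spatial dimensions
// of the output and filter tensors.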
static float doFma(float fval, float ival, float tmp) {
return fval*ival+tmp;
}
static float doFma(half1 fval, half1 ival, float tmp) {
return cpu_half2float(fval)*cpu_half2float(ival)+tmp;
}
static void doEpilog(float *out, int idx, float alphaAcc, float beta) {
if( beta == 0.f ) {
out[idx] = alphaAcc;
} else {
out[idx] = alphaAcc + out[idx]*beta;
}
}
static void doEpilog(half1 *out, int idx, float alphaAcc, float beta) {
if( beta == 0.f ) {
out[idx] = cpu_float2half_rn(alphaAcc);
} else {
out[idx] = cpu_float2half_rn(alphaAcc + cpu_half2float(out[idx])*beta);
}
}
template <typename T_ELEM>
static void conv_cpu_ref (
const T_ELEM* inputData,
const T_ELEM* filterData,
T_ELEM* outputData,
float alpha,
float beta,
bool isNchw,
const int* inDims,
const int* filDims,
const int* outDims,
const int* inStride,
const int* outStride,
const int* stride,
const int* pad,
const int* dilation,
int nbDims
) {
int imDims = nbDims - 2 ;
int filStride[8] = {0} ;
generateStrides(filDims, filStride, nbDims, isNchw);
bool isConv = true; //(CUDNN_CONVOLUTION == mode) ;
// Number of pixels in output
int nPixelsOut = 1 ;
for(int i = 2 ; i < nbDims ; i++)
nPixelsOut *= outDims[i] ;
// Number of pixels in filter
int nPixelsFil = 1 ;
for(int i = 2 ; i < nbDims ; i++)
nPixelsFil *= filDims[i] ;
// Used to store coordinates
int filIds[8] = {0} ;
int outIds[8] = {0} ;
int inIds [8] = {0} ;
int tmpIds[8] = {0} ;
// For each image in the output
for(int ni = 0 ; ni < outDims[0] ; ni++) {
// For each feature layer of the output
for(int ki = 0 ; ki < outDims[1] ; ki++) {
int outputOffset = ni * outStride[0] + ki * outStride[1] ;
// Loop over all entries of the result
for(int outId = 0 ; outId < nPixelsOut ; outId++) {
// Get output pixel ids
lin2dim(outId, outIds, outDims+2, imDims) ; // Skip n and k dimensions
// Now we get the coordinates in input space of the "top left" corner of the filter: multiply by stride and remove pad
for(int d = 0 ; d < imDims ; d++) {
inIds[d] = outIds[d] * stride[d] - pad[d] ;
}
// We then accumulate
float tmp = 0.f;
for(int ci = 0 ; ci < inDims[1] ; ci++) {
int inputOffset = ni * inStride[0] + ci * inStride[1] ;
int filterOffset = ki * filStride[0] + ci * filStride[1] ;
for(int filId = 0 ; filId < nPixelsFil ; filId ++) {
// Get the position of the pixel
lin2dim(filId, filIds, filDims+2, imDims) ;
// Compute the corresponding output pixel
// and check whether we are in the padding area on the fly too (note that for convolution, we flip the image patch, which is equivalent to flipping the filter patch)
bool inside = true ;
for(int d = 0 ; d < imDims && inside ; d++) {
if (isConv) {
tmpIds[d] = inIds[d] + dilation[d] * (filDims[2+d]-1 - filIds[d]) ;
} else {
tmpIds[d] = inIds[d] + dilation[d] * filIds[d] ;
}
inside &= (tmpIds[d] >= 0 && tmpIds[d] < inDims[2+d]) ; // If we are in the padding area: stop and skip computations
}
if(inside) {
int actualTmpId = inputOffset + dim2lin(tmpIds, (inStride)+2, imDims) ;
//int actualFilId = filterOffset + filId ;
int actualFilId = filterOffset + dim2lin(filIds, (filStride)+2, imDims) ;
T_ELEM fval = filterData[actualFilId] ;
T_ELEM ival = inputData [actualTmpId] ;
tmp = doFma(fval, ival, tmp);
}
}
}
// We put the result in the output
int actualOutId = outputOffset + dim2lin(outIds, (outStride)+2, imDims) ;
doEpilog(outputData, actualOutId, alpha*tmp, beta);
}
}
}
}
template<typename T_ELEM>
static void dataGrad_cpu_ref (
const T_ELEM *weight,
const T_ELEM *top_diff,
T_ELEM *output,
float alpha,
float beta,
bool isNchw,
const int* inDims,
const int* filDims,
const int* outDims,
const int* inStride,
const int* outStride,
const int* stride,
const int* pad,
const int* dilation,
int nbDims )
{
// Sanity checks
// output is n x c x h x w
// diff is n x k x p x q
// filter is k x c x r x s
assert(inDims[0] == outDims[0]); // n
assert(inDims[1] == filDims[0]); // k
assert(outDims[1] == filDims[1]); // c
int filStride[8] = {0} ;
generateStrides(filDims, filStride, nbDims, isNchw);
bool isConv = true; //(CUDNN_CONVOLUTION == mode) ;
// For every output pixel (n x c x h x w)
for(int ni = 0; ni < outDims[0]; ni++) {
for(int ci = 0; ci < outDims[1]; ci++) {
for(int hi = 0; hi < outDims[2]; hi++) {
for(int wi = 0; wi < outDims[3]; wi++) {
int outIdx = ni * outStride[0] +
ci * outStride[1] +
hi * outStride[2] +
wi * outStride[3];
float val = 0.0;
// For every diff channel (k)
for(int ki = 0; ki < inDims[1]; ki++) { // Sum over k channels
int offset_filter = ki * filStride[0] + ci * filStride[1];
int offset_diff = ni * inStride[0] + ki * inStride[1];
// For every pixel of the filter (r x s)
for(int ri = 0; ri < filDims[2]; ri++) {
int p = hi + pad[0];
if (isConv){
p -= (filDims[2] - 1 - ri) * dilation[0];
} else {
p -= ri * dilation[0];
}
if ( p%stride[0] )
continue;
p/=stride[0];
for(int si = 0; si < filDims[3]; si++) {
int q = wi + pad[1];
// Fetch the value in filter and diff, multiply and accumulate
// So basically, for the convolution, we replace r by dim-1-r and s by dim-1-s to "flip" the filter
// We can then just reason in terms of correlation
if (isConv){
q -= (filDims[3] - 1 - si) * dilation[1];
} else {
q -= si * dilation[1];
}
// Skip if q isn't a multiple of the stride (p was checked above)
if ( q%stride[1] )
continue;
q/=stride[1];
int inBounds = ( (p >= 0) && (p < inDims[2]) && (q >= 0) && (q < inDims[3]) );
if (inBounds) {
int filterIdx = offset_filter + ri * filStride[2] + si * filStride[3];
int diffIdx = offset_diff + p * inStride[2] + q * inStride[3];
T_ELEM imTmp = top_diff[diffIdx];
T_ELEM filTmp = weight[filterIdx];
val = doFma(filTmp, imTmp, val);
}
}
}
}
doEpilog(output, outIdx, alpha*val, beta);
}
}
}
}
}
template<typename T_ELEM>
static void weightGrad_cpu_ref(/*const TensorNdTestDesc_t *tensorInputDesc,*/
const T_ELEM *image,
/*const TensorNdTestDesc_t *tensorDiffDesc,*/
const T_ELEM *diffData,
/*const ConvNdTestDesc_t *convDesc,*/
/*const TensorNdTestDesc_t *filterOutputDesc,*/
float alpha,
float beta,
T_ELEM *output,
bool isNchw,
const int* inDims,
const int* filDims,
const int* diffDims,
const int* inStride,
const int* diffStride,
const int* stride,
const int* pad,
const int* dilation,
int nbDims )
{
// Some sanity checks
// image is n x c x h x w
// diff is n x k x p x q
// filter is k x c x r x s
assert(inDims[0] == diffDims[0]) ;
assert(inDims[1] == filDims[1]) ;
assert(diffDims[1] == filDims[0]) ;
// Filter stride
int filterStride[4] ;
generateStrides(filDims, filterStride, nbDims, isNchw);
bool isConv = true; //(CUDNN_CONVOLUTION == mode) ;
// For every filter pixel (k x c x r x s)
for(int ci = 0; ci < inDims[1]; ci++) { // Loop over filter pixels
for(int ri = 0; ri < filDims[2]; ri++) { // ^
for(int si = 0; si < filDims[3]; si++) { // ^
for(int ki = 0; ki < filDims[0]; ki++){ // ^
int filIdx = ki * filterStride[0] + ci * filterStride[1] + ri * filterStride[2] + si * filterStride[3] ;
float val = 0.f ;
// For every image (n)
for(int ni = 0 ; ni < inDims[0]; ni++) { // Sum over the batch
int offset_image = ni * inStride[0] + ci * inStride[1] ;
int offset_diff = ni * diffStride[0] + ki * diffStride[1] ;
// For every pixel in diff (p x q)
for(int pi = 0; pi < diffDims[2] ; pi++ ) { // Sum over the pixels of diff
for(int qi = 0; qi < diffDims[3] ; qi++ ) { // ^
// Fetch the value in image and diff, multiply and accumulate
int y = pi * stride[0] - pad[0] ;
int x = qi * stride[1] - pad[1] ;
// Convolution = Correlation with a flipped filter
// So basically, for the convolution, we replace r by dim-1-r and s by dim-1-s to "flip" the filter
// We can then just reason in terms of correlation
if (isConv){
y += (filDims[2] - 1 - ri) * dilation[0] ;
x += (filDims[3] - 1 - si) * dilation[1] ;
} else {
// The effect of dilation on the gradient is to start the "zone of influence" of a given pixel further into the image, so dilation
// only produces a shift in x and y
y += ri * dilation[0] ;
x += si * dilation[1] ;
}
// Image value
int inBounds = ((x >=0)&&(x < inDims[3])&&(y >=0)&&(y < inDims[2]));
if (inBounds) {
int imIdx = offset_image + y * inStride[2] + x * inStride[3] ;
// Diff value
int diffIdx = offset_diff + pi * diffStride[2] + qi * diffStride[3] ;
// Prod and accumulate
T_ELEM imTmp = image[imIdx] ;
T_ELEM diffTmp = diffData[diffIdx];
val = doFma(diffTmp, imTmp, val);
}
}
}
}
doEpilog(output, filIdx, alpha*val, beta);
}
}
}
}
}
float getError(float dev, float ref) {
if (ref > 1.0 || ref < -1.0)
return (dev - ref)/ref;
else
return dev - ref;
}
float getError(half1 dev, half1 ref) {
if (cpu_half2float(ref) > 1.0 || cpu_half2float(ref) < -1.0)
return (cpu_half2float(dev) - cpu_half2float(ref))/cpu_half2float(ref);
else
return cpu_half2float(dev) - cpu_half2float(ref);
}
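// Example of how these feed the THRESHOLD (2.0e-2) check in doConv/doDgrad/doWgrad:
// a reference of 2.0 versus a device value of 2.03 gives a relative error of 0.015 and
// passes, whereas a reference of 0.5 versus 0.53 gives an absolute error of 0.03 and is
// counted as a mismatch, since values in [-1, 1] are compared by absolute difference.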
static inline int getFwdConvDilatedFilterDim(int filterDim,
int dilation)
{
return ( (filterDim - 1) * dilation ) + 1 ;
}
static inline int getFwdConvPaddedImageDim(int tensorDim,
int pad)
{
return tensorDim + (2 * pad) ;
}
static inline int getFwdConvOutputDim( int tensorDim,
int pad,
int filterDim,
int stride,
int dilation)
{
int p = (getFwdConvPaddedImageDim(tensorDim, pad) - getFwdConvDilatedFilterDim(filterDim, dilation))/stride + 1;
return(p);
}
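// Worked example with the default layer from main() (h = w = 32, pad = 0, 3x3 filter,
// stride 1, dilation 1): dilated filter dim = (3-1)*1 + 1 = 3, padded image dim = 32,
// so the output dim is (32 - 3)/1 + 1 = 30, matching the 30x30 spatial size that doTest
// computes for outdimA.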
template <typename T_ELEM>
int doConv(
cudnnHandle_t handle_,
T_ELEM* devPtrI,
T_ELEM* devPtrF,
T_ELEM* devPtrO,
T_ELEM* hostI,
T_ELEM* hostF,
T_ELEM* hostO,
cudnnTensorDescriptor_t cudnnIdesc,
cudnnFilterDescriptor_t cudnnFdesc,
cudnnTensorDescriptor_t cudnnOdesc,
cudnnConvolutionDescriptor_t cudnnConvDesc,
float alpha,
float beta,
cudnnTensorFormat_t filterFormat,
const int* dimA,
const int* filterdimA,
const int* outdimA,
const int* strideA,
const int* outstrideA,
const int* convstrideA,
const int* padA,
const int* dilationA,
const int benchmark) {
int outsize = outstrideA[0]*outdimA[0];
T_ELEM* hostOfromdev = (T_ELEM*)calloc (outsize, sizeof(hostO[0]) );
cudnnConvolutionFwdAlgo_t algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
void *workSpace = 0;
size_t workSpaceSize;
checkCudnnErr ( cudnnGetConvolutionForwardWorkspaceSize(handle_, cudnnIdesc, cudnnFdesc, cudnnConvDesc,
cudnnOdesc, algo, &workSpaceSize) );
if (workSpaceSize > 0) {
cudaMalloc(&workSpace, workSpaceSize);
}
double start = second();
checkCudnnErr ( cudnnConvolutionForward (handle_,
(void*)(&alpha),
cudnnIdesc, devPtrI,
cudnnFdesc, devPtrF,
cudnnConvDesc,
algo,
workSpace, workSpaceSize,
(void*)(&beta),
cudnnOdesc, devPtrO) );
checkCudaErr( cudaDeviceSynchronize() );
double stop = second();
printPerf( stop - start, 0, 0,
0, 0, 0, 0);
checkCudaErr( cudaMemcpy(hostOfromdev, devPtrO, sizeof(hostO[0]) * outsize, cudaMemcpyDeviceToHost) );
checkCudaErr( cudaDeviceSynchronize() );
if (workSpace) {
cudaFree(workSpace);
workSpace = 0;
}
int numErrors = 0;
if (!benchmark) {
conv_cpu_ref<T_ELEM>( hostI, hostF, hostO, alpha, beta, (filterFormat == CUDNN_TENSOR_NCHW), dimA, filterdimA, outdimA, strideA, outstrideA, convstrideA, padA, dilationA, 4);
for (int index = 0; index < outsize; index++) { // assuming out data is packed
float diff = getError(hostOfromdev[index], hostO[index]);
if (diff < 0) diff = -diff;
if(diff > THRESHOLD) {
numErrors++;
}
}
}
return numErrors;
}
template <typename T_ELEM>
int doDgrad(
cudnnHandle_t handle_,
T_ELEM* devPtrI,
T_ELEM* devPtrF,
T_ELEM* devPtrO,
T_ELEM* hostI,
T_ELEM* hostF,
T_ELEM* hostO,
cudnnTensorDescriptor_t cudnnIdesc,
cudnnFilterDescriptor_t cudnnFdesc,
cudnnTensorDescriptor_t cudnnOdesc,
cudnnConvolutionDescriptor_t cudnnConvDesc,
float alpha,
float beta,
cudnnTensorFormat_t filterFormat,
const int* dimA,
const int* filterdimA,
const int* outdimA,
const int* strideA,
const int* outstrideA,
const int* convstrideA,
const int* padA,
const int* dilationA,
const int benchmark) {
int insize = strideA[0]*dimA[0];
T_ELEM* hostIfromdev = (T_ELEM*)calloc (insize, sizeof(hostI[0]) );
cudnnConvolutionBwdDataAlgo_t algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
void *workSpace = 0;
size_t workSpaceSize;
checkCudnnErr ( cudnnGetConvolutionBackwardDataWorkspaceSize(handle_, cudnnFdesc, cudnnOdesc, cudnnConvDesc,
cudnnIdesc, algo, &workSpaceSize) );
if (workSpaceSize > 0) {
cudaMalloc(&workSpace, workSpaceSize);
}
double start = second();
checkCudnnErr ( cudnnConvolutionBackwardData (handle_,
(void*)(&alpha),
cudnnFdesc, devPtrF,
cudnnOdesc, devPtrO,
cudnnConvDesc,
algo,
workSpace, workSpaceSize,
(void*)(&beta),
cudnnIdesc, devPtrI) );
checkCudaErr( cudaDeviceSynchronize() );
double stop = second();
printPerf( stop - start, 0, 0,
0, 0, 0, 0);
checkCudaErr( cudaMemcpy(hostIfromdev, devPtrI, sizeof(hostI[0]) * insize, cudaMemcpyDeviceToHost) );
checkCudaErr( cudaDeviceSynchronize() );
if (workSpace) {
cudaFree(workSpace);
workSpace = 0;
}
int numErrors = 0;
if (!benchmark) {
dataGrad_cpu_ref<T_ELEM>(hostF, hostO, hostI, alpha, beta, (filterFormat == CUDNN_TENSOR_NCHW), outdimA, filterdimA, dimA, outstrideA, strideA, convstrideA, padA, dilationA, 4);
for (int index = 0; index < insize; index++) { // assuming in data is packed
float diff = getError(hostIfromdev[index], hostI[index]);
if (diff < 0) diff = -diff;
if(diff > THRESHOLD) {
numErrors++;
}
}
}
return numErrors;
}
template <typename T_ELEM>
int doWgrad(
cudnnHandle_t handle_,
T_ELEM* devPtrI,
T_ELEM* devPtrF,
T_ELEM* devPtrO,
T_ELEM* hostI,
T_ELEM* hostF,
T_ELEM* hostO,
cudnnTensorDescriptor_t cudnnIdesc,
cudnnFilterDescriptor_t cudnnFdesc,
cudnnTensorDescriptor_t cudnnOdesc,
cudnnConvolutionDescriptor_t cudnnConvDesc,
float alpha,
float beta,
cudnnTensorFormat_t filterFormat,
const int* dimA,
const int* filterdimA,
const int* outdimA,
const int* strideA,
const int* outstrideA,
const int* convstrideA,
const int* padA,
const int* dilationA,
const int benchmark) {
int filsize = filterdimA[0]*filterdimA[1]*filterdimA[2]*filterdimA[3];
T_ELEM* hostFfromdev = (T_ELEM*)calloc (filsize, sizeof(hostF[0]) );
cudnnConvolutionBwdFilterAlgo_t algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
void *workSpace = 0;
size_t workSpaceSize;
checkCudnnErr ( cudnnGetConvolutionBackwardFilterWorkspaceSize(handle_, cudnnIdesc, cudnnOdesc, cudnnConvDesc,
cudnnFdesc, algo, &workSpaceSize) );
if (workSpaceSize > 0) {
cudaMalloc(&workSpace, workSpaceSize);
}
double start = second();
checkCudnnErr ( cudnnConvolutionBackwardFilter (handle_,
(void*)(&alpha),
cudnnIdesc, devPtrI,
cudnnOdesc, devPtrO,
cudnnConvDesc,
algo,
workSpace, workSpaceSize,
(void*)(&beta),
cudnnFdesc, devPtrF) );
checkCudaErr( cudaDeviceSynchronize() );
double stop = second();
printPerf( stop - start, 0, 0,
0, 0, 0, 0);
checkCudaErr( cudaMemcpy(hostFfromdev, devPtrF, sizeof(hostF[0]) * filsize, cudaMemcpyDeviceToHost) );
checkCudaErr( cudaDeviceSynchronize() );
if (workSpace) {
cudaFree(workSpace);
workSpace = 0;
}
int numErrors = 0;
if (!benchmark) {
weightGrad_cpu_ref<T_ELEM>(hostI, hostO, alpha, beta, hostF, (filterFormat == CUDNN_TENSOR_NCHW), dimA, filterdimA, outdimA, strideA, outstrideA, convstrideA, padA, dilationA, 4);
for (int index = 0; index < filsize; index++) { // assuming filter data is packed
float diff = getError(hostFfromdev[index], hostF[index]);
if (diff < 0) diff = -diff;
if(diff > THRESHOLD) {
numErrors++;
}
}
}
return numErrors;
}
template <typename T_ELEM>
int doTest(int algo, int* dimA, int* padA, int* convstrideA, int* filterdimA, cudnnTensorFormat_t filterFormat, int mathType, int benchmark) {
cudnnHandle_t handle_;
T_ELEM* devPtrI;
T_ELEM* devPtrF;
T_ELEM* devPtrO;
T_ELEM* hostI;
T_ELEM* hostF;
T_ELEM* hostO;
cudnnTensorDescriptor_t cudnnIdesc;
cudnnFilterDescriptor_t cudnnFdesc;
cudnnTensorDescriptor_t cudnnOdesc;
cudnnConvolutionDescriptor_t cudnnConvDesc;
int convDim = 2;
float alpha = 1.0f;
float beta = 0.0;
checkCudnnErr(cudnnCreate(&handle_));
checkCudnnErr( cudnnCreateTensorDescriptor( &cudnnIdesc ));
checkCudnnErr( cudnnCreateFilterDescriptor( &cudnnFdesc ));
checkCudnnErr( cudnnCreateTensorDescriptor( &cudnnOdesc ));
checkCudnnErr( cudnnCreateConvolutionDescriptor( &cudnnConvDesc ));
int dilationA[] = {1, 1};
int strideA[] = {8192, 1024, 32, 1};
generateStrides(dimA, strideA, 4, (filterFormat == CUDNN_TENSOR_NCHW));
int insize = strideA[0]*dimA[0];
int filtersize = filterdimA[0]*filterdimA[1]*filterdimA[2]*filterdimA[3];
int outdimA[] = {1, 8, 30, 30};
outdimA[0] = dimA[0];
outdimA[1] = filterdimA[0];
for( int dim = 0; dim < 2; dim++) {
outdimA[dim+2] = getFwdConvOutputDim( dimA[dim+2],
padA[dim],
filterdimA[dim+2],
convstrideA[dim],
dilationA[dim]);
}
int outstrideA[] = {7200, 900, 30, 1};
generateStrides(outdimA, outstrideA, 4, (filterFormat == CUDNN_TENSOR_NCHW));
int outsize = outstrideA[0]*outdimA[0];
cudaMalloc ((void**)&(devPtrI), (insize) * sizeof(devPtrI[0]) );
cudaMalloc ((void**)&(devPtrF), (filtersize) * sizeof(devPtrF[0]) );
cudaMalloc ((void**)&(devPtrO), (outsize) * sizeof(devPtrO[0]) );
hostI = (T_ELEM*)calloc (insize, sizeof(hostI[0]) );
hostF = (T_ELEM*)calloc (filtersize, sizeof(hostF[0]) );
hostO = (T_ELEM*)calloc (outsize, sizeof(hostO[0]) );
initImage(hostI, insize);
initImage(hostF, filtersize);
initImage(hostO, outsize);
checkCudaErr( cudaMemcpy(devPtrI, hostI, sizeof(hostI[0]) * insize, cudaMemcpyHostToDevice));
checkCudaErr( cudaMemcpy(devPtrF, hostF, sizeof(hostF[0]) * filtersize, cudaMemcpyHostToDevice));
checkCudaErr( cudaMemcpy(devPtrO, hostO, sizeof(hostO[0]) * outsize, cudaMemcpyHostToDevice));
checkCudaErr( cudaDeviceSynchronize() );
checkCudnnErr( cudnnSetTensorNdDescriptor(cudnnIdesc, getDataType<T_ELEM>(), convDim+2, dimA, strideA) );
checkCudnnErr( cudnnSetFilterNdDescriptor(cudnnFdesc, getDataType<T_ELEM>(), filterFormat, convDim+2, filterdimA));
checkCudnnErr( cudnnSetConvolutionNdDescriptor(cudnnConvDesc,
convDim,
padA,
convstrideA,
dilationA,
CUDNN_CONVOLUTION,
CUDNN_DATA_FLOAT) );
if (mathType == 1) {
checkCudnnErr( cudnnSetConvolutionMathType(cudnnConvDesc, CUDNN_TENSOR_OP_MATH) );
}
checkCudnnErr( cudnnSetTensorNdDescriptor(cudnnOdesc, getDataType<T_ELEM>(), convDim+2, outdimA, outstrideA) );
int numErrors = 0;
if (algo == 0) {
printf("Testing conv\n");
numErrors = doConv(
handle_,
devPtrI,
devPtrF,
devPtrO,
hostI,
hostF,
hostO,
cudnnIdesc,
cudnnFdesc,
cudnnOdesc,
cudnnConvDesc,
alpha,
beta,
filterFormat,
dimA,
filterdimA,
outdimA,
strideA,
outstrideA,
convstrideA,
padA,
dilationA,
benchmark);
} else if (algo == 1) {
printf("Testing dgrad\n");
numErrors = doDgrad(
handle_,
devPtrI,
devPtrF,
devPtrO,
hostI,
hostF,
hostO,
cudnnIdesc,
cudnnFdesc,
cudnnOdesc,
cudnnConvDesc,
alpha,
beta,
filterFormat,
dimA,
filterdimA,
outdimA,
strideA,
outstrideA,
convstrideA,
padA,
dilationA,
benchmark);
} else {
printf("Testing wgrad\n");
numErrors = doWgrad(
handle_,
devPtrI,
devPtrF,
devPtrO,
hostI,
hostF,
hostO,
cudnnIdesc,
cudnnFdesc,
cudnnOdesc,
cudnnConvDesc,
alpha,
beta,
filterFormat,
dimA,
filterdimA,
outdimA,
strideA,
outstrideA,
convstrideA,
padA,
dilationA,
benchmark);
}
if (!benchmark) {
if (numErrors == 0) {
printf("Test PASSED\n");
} else {
printf("Test FAILED, num errors = %d\n", numErrors);
}
}
if (devPtrI) cudaFree (devPtrI);
if (devPtrF) cudaFree (devPtrF);
if (devPtrO) cudaFree (devPtrO);
if (cudnnIdesc) cudnnDestroyTensorDescriptor(cudnnIdesc);
if (cudnnFdesc) cudnnDestroyFilterDescriptor(cudnnFdesc);
if (cudnnOdesc) cudnnDestroyTensorDescriptor(cudnnOdesc);
if (cudnnConvDesc) cudnnDestroyConvolutionDescriptor(cudnnConvDesc);
return 0;
}
int main( int argc, char** argv )
{
int algo = 0;
int mathType = 0;
int benchmark = 0;
int dimA[] = {1, 8, 32, 32};
int padA[] = {0, 0};
int convstrideA[] = {1, 1};
int filterdimA[] = {8, 8, 3, 3};
cudnnTensorFormat_t filterFormat = CUDNN_TENSOR_NCHW;
int error = 0;
argc -= 1;
argv++;
while (argc) {
if (*argv[0] == SWITCH_CHAR) {
switch (*(argv[0]+1)) {
case 'b':
benchmark = 1;
break;
case 'c':
dimA[1] = atol(argv[0]+2);
filterdimA[1] = dimA[1];
break;
case 'd':
if ( strncmp( argv[0]+1, "dgrad" , strlen("dgrad")) == 0) {
algo = 1;
}
break;
case 'f':
if ( strncmp( argv[0]+1, "filterFormat" , strlen("filterFormat")) == 0) {
filterFormat = (cudnnTensorFormat_t)(atoi(argv[0]+ 1 + strlen("filterFormat")));
}
break;
case 'h':
dimA[2] = atol(argv[0]+2);
break;
case 'k':
filterdimA[0] = atol(argv[0]+2);
break;
case 'm':
if ( strncmp( argv[0]+1, "mathType1" , strlen("mathType1")) == 0) {
mathType = 1;
}
break;
case 'n':
dimA[0] = atol(argv[0]+2);
break;
case 'p':
if ( strncmp( argv[0]+1, "pad_h" , strlen("pad_h")) == 0) {
padA[0] = (int)atol(argv[0]+ 1 + strlen("pad_h"));
}
else if ( strncmp( argv[0]+1, "pad_w" , strlen("pad_w")) == 0) {
padA[1] = (int)atol(argv[0]+ 1 + strlen("pad_w"));
}
break;
case 'r':
filterdimA[2] = atol(argv[0]+2);
break;
case 's':
filterdimA[3] = atol(argv[0]+2);
break;
case 'u':
convstrideA[0] = atol(argv[0]+2);
break;
case 'v':
convstrideA[1] = atol(argv[0]+2);
break;
case 'w':
if ( strncmp( argv[0]+1, "wgrad" , strlen("wgrad")) == 0) {
algo = 2;
}
else dimA[3] = atol(argv[0]+2);
break;
default:
error++;
break;
}
if (error) {
fprintf(stderr, "Unknown switch '%c%s'\n\n", SWITCH_CHAR, argv[0]+1);
return error;
}
}
else {
fprintf(stderr, "Invalid separator '%c' for option '%s'\n\n", *argv[0], argv[0] );
return 1;
}
argc -= 1;
argv++;
}
printf("Testing single precision\n");
doTest<float>(algo, dimA, padA, convstrideA, filterdimA, filterFormat, mathType, benchmark);
printf("Testing half precision (math in single precision)\n");
doTest<half1>(algo, dimA, padA, convstrideA, filterdimA, filterFormat, mathType, benchmark);
return 0;
}