@protoget
Created April 17, 2018 03:02
Reproduce cuDNN RNN error

make && ./RNN

This produces:

cuDNN Error: CUDNN_STATUS_BAD_PARAM RNN_example.cu 136

The Makefile and RNN_example.cu below reproduce the failure.

Makefile:

# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda
# architecture
HOST_ARCH := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
# Adjust this for ARMv7 with a 32-bit filesystem
ifeq ($(TARGET_ARCH), aarch64)
# grep -c prints a match count, so the comparison against 1 works
ifeq ($(shell file /sbin/init | grep -c 32-bit), 1)
TARGET_ARCH=armv7l
endif
endif
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l))
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le))
TARGET_SIZE := 64
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
TARGET_SIZE := 32
endif
else
TARGET_SIZE := $(shell getconf LONG_BIT)
endif
else
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le))
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
endif
endif
# operating system
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq ($(TARGET_OS),QNX)
TARGET_OS := qnx
endif
ifeq (,$(filter $(TARGET_OS),linux darwin qnx QNX android))
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif
# host compiler
ifeq ($(TARGET_OS),darwin)
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
HOST_COMPILER ?= clang++
endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
ifeq ($(TARGET_OS),linux)
HOST_COMPILER ?= arm-linux-gnueabihf-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
else ifeq ($(TARGET_OS),android)
HOST_COMPILER ?= arm-linux-androideabi-g++
endif
else ifeq ($(TARGET_ARCH),aarch64)
ifeq ($(TARGET_OS), linux)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_OS), android)
HOST_COMPILER ?= aarch64-linux-android-g++
endif
else ifeq ($(TARGET_ARCH),ppc64le)
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
endif
endif
HOST_COMPILER ?= g++
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
# internal flags
NVCCFLAGS := -m${TARGET_SIZE}
CCFLAGS :=
LDFLAGS :=
# build flags
ifeq ($(TARGET_OS),darwin)
LDFLAGS += -rpath $(CUDA_PATH)/lib
CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
LDFLAGS += -pie
CCFLAGS += -fpie -fpic -fexceptions
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
endif
endif
endif
# Debug build flags
ifeq ($(dbg),1)
NVCCFLAGS += -g -G
BUILD_TYPE := debug
else
BUILD_TYPE := release
endif
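# Note (added for clarity): run `make dbg=1` to take the debug branch above;
# -g emits host debug info and -G emits device debug info, which also
# disables most device-side optimizations.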
ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
SAMPLE_ENABLED := 1
ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
# Common includes and paths for CUDA
ifneq ($(TARGET_ARCH), ppc64le)
INCLUDES := -I$(CUDA_PATH)/include
else
INCLUDES := -I$(CUDA_PATH)/targets/ppc64le-linux/include
endif
LIBRARIES :=
################################################################################
# Gencode arguments
SMS ?= 30 35 50 53
ifeq ($(SMS),)
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif
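# Worked example (added for clarity): with the default SMS = "30 35 50 53",
# the foreach above expands GENCODE_FLAGS to
#   -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35
#   -gencode arch=compute_50,code=sm_50 -gencode arch=compute_53,code=sm_53
# and HIGHEST_SM = 53 adds a PTX fallback for newer GPUs:
#   -gencode arch=compute_53,code=compute_53
# SMS can be overridden on the command line, e.g. `make SMS="60 70"`.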
INCLUDES += -I.
LIBRARIES += -L. -lcublas -lcudnn -lcudart -lstdc++ -lm
ifeq ($(SAMPLE_ENABLED),0)
# When the sample is waived, EXEC turns every recipe command into an echo
# instead of running it.
EXEC ?= @echo "[@]"
endif
################################################################################
# Target rules
all: build

build: RNN

check.deps:
ifeq ($(SAMPLE_ENABLED),0)
	@echo "Sample will be waived due to the above missing dependencies"
else
	@echo "Sample is ready - all dependencies have been met"
endif
OBJ = RNN_example.o

RNN: $(OBJ)
	$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)

%.o: %.cu
	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

run: build
	$(EXEC) ./RNN

clean:
	rm -rf *.o
	rm -rf RNN

clobber: clean

RNN_example.cu:
#include <cudnn.h>
#include <cuda_fp16.h>
#include <cuda.h>
#include <stdio.h>
// Define some error checking macros.
#define cudaErrCheck(stat) { cudaErrCheck_((stat), __FILE__, __LINE__); }
void cudaErrCheck_(cudaError_t stat, const char *file, int line) {
   if (stat != cudaSuccess) {
      fprintf(stderr, "CUDA Error: %s %s %d\n", cudaGetErrorString(stat), file, line);
   }
}

#define cudnnErrCheck(stat) { cudnnErrCheck_((stat), __FILE__, __LINE__); }
void cudnnErrCheck_(cudnnStatus_t stat, const char *file, int line) {
   if (stat != CUDNN_STATUS_SUCCESS) {
      fprintf(stderr, "cuDNN Error: %s %s %d\n", cudnnGetErrorString(stat), file, line);
   }
}
int main(int argc, char* argv[]) {
   // Hyperparameters (hardcoded; seqLength is effectively 1, so a single
   // input tensor descriptor is enough).
   int numLayers = 1;
   int hiddenSize = 128;
   int inputSize = 128;
   int miniBatch = 4;
   float dropout = 0;
   int persistent = 0;

   cudnnHandle_t cudnnHandle;
   cudnnErrCheck(cudnnCreate(&cudnnHandle));

   // -------------------------
   // Set up xDesc
   // -------------------------
   cudnnTensorDescriptor_t xDesc;
   cudnnErrCheck(cudnnCreateTensorDescriptor(&xDesc));

   // Fully packed 3D tensor: [miniBatch, inputSize, 1]
   int dimA[3];
   int strideA[3];
   dimA[0] = miniBatch;
   dimA[1] = inputSize;
   dimA[2] = 1;
   strideA[0] = dimA[2] * dimA[1];
   strideA[1] = dimA[2];
   strideA[2] = 1;
   cudnnErrCheck(cudnnSetTensorNdDescriptor(xDesc, CUDNN_DATA_HALF, 3, dimA, strideA));
   // -------------------------
   // Set up the dropout descriptor (needed for the RNN descriptor)
   // -------------------------
   unsigned long long seed = 1337ull; // Pick a seed.

   cudnnDropoutDescriptor_t dropoutDesc;
   cudnnErrCheck(cudnnCreateDropoutDescriptor(&dropoutDesc));

   // How much memory does dropout need for states?
   // These states are used to generate random numbers internally
   // and should not be freed until the RNN descriptor is no longer used.
   size_t stateSize;
   void *states;
   cudnnErrCheck(cudnnDropoutGetStatesSize(cudnnHandle, &stateSize));
   cudaErrCheck(cudaMalloc(&states, stateSize));

   cudnnErrCheck(cudnnSetDropoutDescriptor(dropoutDesc,
                                           cudnnHandle,
                                           dropout,
                                           states,
                                           stateSize,
                                           seed));
   // -------------------------
   // Set up the RNN descriptor
   // -------------------------
   cudnnRNNDescriptor_t rnnDesc;
   cudnnRNNMode_t RNNMode = CUDNN_LSTM;
   cudnnRNNAlgo_t RNNAlgo;

   cudnnErrCheck(cudnnCreateRNNDescriptor(&rnnDesc));

   // Persistent RNNs are only supported on Pascal+ GPUs.
   if      (persistent == 0) RNNAlgo = CUDNN_RNN_ALGO_STANDARD;
   else if (persistent == 1) RNNAlgo = CUDNN_RNN_ALGO_PERSIST_STATIC;
   else if (persistent == 2) RNNAlgo = CUDNN_RNN_ALGO_PERSIST_DYNAMIC;

   cudnnErrCheck(cudnnSetRNNDescriptor_v6(cudnnHandle,
                                          rnnDesc,
                                          hiddenSize,
                                          numLayers,
                                          dropoutDesc,
                                          CUDNN_LINEAR_INPUT, // We can also skip the input matrix transformation
                                          CUDNN_UNIDIRECTIONAL,
                                          RNNMode,
                                          RNNAlgo, // Can be changed to use persistent RNNs on Pascal+ GPUs.
                                          CUDNN_DATA_FLOAT)); // math precision

   cudnnErrCheck(cudnnSetRNNMatrixMathType(rnnDesc, CUDNN_TENSOR_OP_MATH));
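
   // Aside (not part of the original repro): CUDNN_TENSOR_OP_MATH targets
   // Tensor Cores, which only exist on Volta (sm_70) and newer GPUs. When
   // experimenting, one could guard it, e.g.:
   //
   //    cudaDeviceProp prop;
   //    cudaErrCheck(cudaGetDeviceProperties(&prop, 0));
   //    if (prop.major < 7) {
   //       cudnnErrCheck(cudnnSetRNNMatrixMathType(rnnDesc, CUDNN_DEFAULT_MATH));
   //    }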
   // -------------------------
   // Set up parameters
   // -------------------------
   // This needs to be done after the RNN descriptor is set; otherwise
   // we don't know how many parameters we have to allocate.
   void *w;
   cudnnFilterDescriptor_t wDesc;
   cudnnErrCheck(cudnnCreateFilterDescriptor(&wDesc));

   size_t weightsSize;
   cudnnErrCheck(cudnnGetRNNParamsSize(cudnnHandle, rnnDesc, xDesc, &weightsSize, CUDNN_DATA_HALF));

   // The weights are exposed as a flat 1D filter of half elements.
   int dimW[3];
   dimW[0] = weightsSize / sizeof(half);
   dimW[1] = 1;
   dimW[2] = 1;
   cudnnErrCheck(cudnnSetFilterNdDescriptor(wDesc, CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW, 3, dimW));

   cudaErrCheck(cudaMalloc((void**)&w, weightsSize));
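
   // Sanity check (added; assumes cuDNN's canonical LSTM layout of 4 gates
   // with separate input and recurrent bias vectors): the expected per-layer
   // parameter count is 4*hiddenSize*(inputSize + hiddenSize) weights plus
   // 8*hiddenSize biases, i.e. 132096 half elements (264192 bytes) for the
   // sizes above.
   printf("weightsSize reported by cuDNN: %zu bytes (%zu half elements)\n",
          weightsSize, weightsSize / sizeof(half));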
   // -------------------------
   // Query an RNN weight matrix
   // -------------------------
   // (This trimmed repro stops after the first linear-layer matrix query;
   // the full sample would go on to set up work space and reserved memory.)
   cudnnFilterDescriptor_t linLayerMatDesc;
   cudnnErrCheck(cudnnCreateFilterDescriptor(&linLayerMatDesc));
   half *linLayerMat;
   cudnnErrCheck(
       cudnnGetRNNLinLayerMatrixParams(
           cudnnHandle,
           rnnDesc,
           0, /* layer */
           xDesc,
           wDesc,
           w,
           0, /* linLayerID */
           linLayerMatDesc,
           (void**)&linLayerMat));

   return 0;
}
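
// Suggestion (not in the original gist): when reporting this kind of failure
// it helps to record the library versions in play, e.g. near the top of
// main():
//
//    printf("cudnnGetVersion(): %zu\n", (size_t)cudnnGetVersion());
//    int rt = 0;
//    cudaErrCheck(cudaRuntimeGetVersion(&rt));
//    printf("CUDA runtime version: %d\n", rt);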