@protoget
Created April 17, 2018 03:02
Reproduce cuDNN RNN error

make && ./RNN

This produces:

cuDNN Error: CUDNN_STATUS_BAD_PARAM RNN_example.cu 136

The Makefile and RNN_example.cu below reproduce the failure.

Makefile:

# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda
# architecture
HOST_ARCH := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
# Adjust this for ARMv7 with a 32-bit filesystem
ifeq ($(TARGET_ARCH), aarch64)
# grep -c prints a match count, so the comparison against 1 works
ifeq ($(shell file /sbin/init | grep -c 32-bit), 1)
TARGET_ARCH=armv7l
endif
endif
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l))
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le))
TARGET_SIZE := 64
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
TARGET_SIZE := 32
endif
else
TARGET_SIZE := $(shell getconf LONG_BIT)
endif
else
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le))
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
endif
endif
# operating system
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq ($(TARGET_OS),QNX)
TARGET_OS := qnx
endif
ifeq (,$(filter $(TARGET_OS),linux darwin qnx QNX android))
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif
# host compiler
ifeq ($(TARGET_OS),darwin)
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
HOST_COMPILER ?= clang++
endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
ifeq ($(TARGET_OS),linux)
HOST_COMPILER ?= arm-linux-gnueabihf-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
else ifeq ($(TARGET_OS),android)
HOST_COMPILER ?= arm-linux-androideabi-g++
endif
else ifeq ($(TARGET_ARCH),aarch64)
ifeq ($(TARGET_OS), linux)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_OS), android)
HOST_COMPILER ?= aarch64-linux-android-g++
endif
else ifeq ($(TARGET_ARCH),ppc64le)
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
endif
endif
HOST_COMPILER ?= g++
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
# internal flags
NVCCFLAGS := -m${TARGET_SIZE}
CCFLAGS :=
LDFLAGS :=
# build flags
ifeq ($(TARGET_OS),darwin)
LDFLAGS += -rpath $(CUDA_PATH)/lib
CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
LDFLAGS += -pie
CCFLAGS += -fpie -fpic -fexceptions
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
endif
endif
endif
# Debug build flags
ifeq ($(dbg),1)
NVCCFLAGS += -g -G
BUILD_TYPE := debug
else
BUILD_TYPE := release
endif
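# Note (added for clarity): run `make dbg=1` to take the debug branch above;
# -g emits host debug info and -G emits device debug info, which also
# disables most device-side optimizations.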
ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
SAMPLE_ENABLED := 1
ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
# Common includes and paths for CUDA
ifneq ($(TARGET_ARCH), ppc64le)
INCLUDES := -I$(CUDA_PATH)/include
else
INCLUDES := -I$(CUDA_PATH)/targets/ppc64le-linux/include
endif
LIBRARIES :=
################################################################################
# Gencode arguments
SMS ?= 30 35 50 53
ifeq ($(SMS),)
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif
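# Worked example (added for clarity): with the default SMS = "30 35 50 53",
# the foreach above expands GENCODE_FLAGS to
#   -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35
#   -gencode arch=compute_50,code=sm_50 -gencode arch=compute_53,code=sm_53
# and HIGHEST_SM = 53 adds a PTX fallback for newer GPUs:
#   -gencode arch=compute_53,code=compute_53
# SMS can be overridden on the command line, e.g. `make SMS="60 70"`.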
INCLUDES += -I.
LIBRARIES += -L. -lcublas -lcudnn -lcudart -lstdc++ -lm
ifeq ($(SAMPLE_ENABLED),0)
# When the sample is waived, EXEC turns every recipe command into an echo
# instead of running it.
EXEC ?= @echo "[@]"
endif
################################################################################
# Target rules
all: build

build: RNN

check.deps:
ifeq ($(SAMPLE_ENABLED),0)
	@echo "Sample will be waived due to the above missing dependencies"
else
	@echo "Sample is ready - all dependencies have been met"
endif
OBJ = RNN_example.o

RNN: $(OBJ)
	$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)

%.o: %.cu
	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

run: build
	$(EXEC) ./RNN

clean:
	rm -rf *.o
	rm -rf RNN

clobber: clean

RNN_example.cu:
#include <cudnn.h>
#include <cuda_fp16.h>
#include <cuda.h>
#include <stdio.h>
// Define some error checking macros.
#define cudaErrCheck(stat) { cudaErrCheck_((stat), __FILE__, __LINE__); }
void cudaErrCheck_(cudaError_t stat, const char *file, int line) {
   if (stat != cudaSuccess) {
      fprintf(stderr, "CUDA Error: %s %s %d\n", cudaGetErrorString(stat), file, line);
   }
}

#define cudnnErrCheck(stat) { cudnnErrCheck_((stat), __FILE__, __LINE__); }
void cudnnErrCheck_(cudnnStatus_t stat, const char *file, int line) {
   if (stat != CUDNN_STATUS_SUCCESS) {
      fprintf(stderr, "cuDNN Error: %s %s %d\n", cudnnGetErrorString(stat), file, line);
   }
}
int main(int argc, char* argv[]) {
   // Hyperparameters (hardcoded; seqLength is effectively 1, so a single
   // input tensor descriptor is enough).
   int numLayers = 1;
   int hiddenSize = 128;
   int inputSize = 128;
   int miniBatch = 4;
   float dropout = 0;
   int persistent = 0;

   cudnnHandle_t cudnnHandle;
   cudnnErrCheck(cudnnCreate(&cudnnHandle));

   // -------------------------
   // Set up xDesc
   // -------------------------
   cudnnTensorDescriptor_t xDesc;
   cudnnErrCheck(cudnnCreateTensorDescriptor(&xDesc));

   // Fully packed 3D tensor: [miniBatch, inputSize, 1]
   int dimA[3];
   int strideA[3];
   dimA[0] = miniBatch;
   dimA[1] = inputSize;
   dimA[2] = 1;
   strideA[0] = dimA[2] * dimA[1];
   strideA[1] = dimA[2];
   strideA[2] = 1;
   cudnnErrCheck(cudnnSetTensorNdDescriptor(xDesc, CUDNN_DATA_HALF, 3, dimA, strideA));
   // -------------------------
   // Set up the dropout descriptor (needed for the RNN descriptor)
   // -------------------------
   unsigned long long seed = 1337ull; // Pick a seed.

   cudnnDropoutDescriptor_t dropoutDesc;
   cudnnErrCheck(cudnnCreateDropoutDescriptor(&dropoutDesc));

   // How much memory does dropout need for states?
   // These states are used to generate random numbers internally
   // and should not be freed until the RNN descriptor is no longer used.
   size_t stateSize;
   void *states;
   cudnnErrCheck(cudnnDropoutGetStatesSize(cudnnHandle, &stateSize));
   cudaErrCheck(cudaMalloc(&states, stateSize));

   cudnnErrCheck(cudnnSetDropoutDescriptor(dropoutDesc,
                                           cudnnHandle,
                                           dropout,
                                           states,
                                           stateSize,
                                           seed));
   // -------------------------
   // Set up the RNN descriptor
   // -------------------------
   cudnnRNNDescriptor_t rnnDesc;
   cudnnRNNMode_t RNNMode = CUDNN_LSTM;
   cudnnRNNAlgo_t RNNAlgo;

   cudnnErrCheck(cudnnCreateRNNDescriptor(&rnnDesc));

   // Persistent RNNs are only supported on Pascal+ GPUs.
   if      (persistent == 0) RNNAlgo = CUDNN_RNN_ALGO_STANDARD;
   else if (persistent == 1) RNNAlgo = CUDNN_RNN_ALGO_PERSIST_STATIC;
   else if (persistent == 2) RNNAlgo = CUDNN_RNN_ALGO_PERSIST_DYNAMIC;

   cudnnErrCheck(cudnnSetRNNDescriptor_v6(cudnnHandle,
                                          rnnDesc,
                                          hiddenSize,
                                          numLayers,
                                          dropoutDesc,
                                          CUDNN_LINEAR_INPUT, // We can also skip the input matrix transformation
                                          CUDNN_UNIDIRECTIONAL,
                                          RNNMode,
                                          RNNAlgo, // Can be changed to use persistent RNNs on Pascal+ GPUs.
                                          CUDNN_DATA_FLOAT)); // math precision

   cudnnErrCheck(cudnnSetRNNMatrixMathType(rnnDesc, CUDNN_TENSOR_OP_MATH));
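
   // Aside (not part of the original repro): CUDNN_TENSOR_OP_MATH targets
   // Tensor Cores, which only exist on Volta (sm_70) and newer GPUs. When
   // experimenting, one could guard it, e.g.:
   //
   //    cudaDeviceProp prop;
   //    cudaErrCheck(cudaGetDeviceProperties(&prop, 0));
   //    if (prop.major < 7) {
   //       cudnnErrCheck(cudnnSetRNNMatrixMathType(rnnDesc, CUDNN_DEFAULT_MATH));
   //    }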
   // -------------------------
   // Set up parameters
   // -------------------------
   // This needs to be done after the RNN descriptor is set; otherwise
   // we don't know how many parameters we have to allocate.
   void *w;
   cudnnFilterDescriptor_t wDesc;
   cudnnErrCheck(cudnnCreateFilterDescriptor(&wDesc));

   size_t weightsSize;
   cudnnErrCheck(cudnnGetRNNParamsSize(cudnnHandle, rnnDesc, xDesc, &weightsSize, CUDNN_DATA_HALF));

   // The weights are exposed as a flat 1D filter of half elements.
   int dimW[3];
   dimW[0] = weightsSize / sizeof(half);
   dimW[1] = 1;
   dimW[2] = 1;
   cudnnErrCheck(cudnnSetFilterNdDescriptor(wDesc, CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW, 3, dimW));

   cudaErrCheck(cudaMalloc((void**)&w, weightsSize));
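
   // Sanity check (added; assumes cuDNN's canonical LSTM layout of 4 gates
   // with separate input and recurrent bias vectors): the expected per-layer
   // parameter count is 4*hiddenSize*(inputSize + hiddenSize) weights plus
   // 8*hiddenSize biases, i.e. 132096 half elements (264192 bytes) for the
   // sizes above.
   printf("weightsSize reported by cuDNN: %zu bytes (%zu half elements)\n",
          weightsSize, weightsSize / sizeof(half));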
   // -------------------------
   // Query an RNN weight matrix
   // -------------------------
   // (This trimmed repro stops after the first linear-layer matrix query;
   // the full sample would go on to set up work space and reserved memory.)
   cudnnFilterDescriptor_t linLayerMatDesc;
   cudnnErrCheck(cudnnCreateFilterDescriptor(&linLayerMatDesc));
   half *linLayerMat;
   cudnnErrCheck(
       cudnnGetRNNLinLayerMatrixParams(
           cudnnHandle,
           rnnDesc,
           0, /* layer */
           xDesc,
           wDesc,
           w,
           0, /* linLayerID */
           linLayerMatDesc,
           (void**)&linLayerMat));

   return 0;
}
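
// Suggestion (not in the original gist): when reporting this kind of failure
// it helps to record the library versions in play, e.g. near the top of
// main():
//
//    printf("cudnnGetVersion(): %zu\n", (size_t)cudnnGetVersion());
//    int rt = 0;
//    cudaErrCheck(cudaRuntimeGetVersion(&rt));
//    printf("CUDA runtime version: %d\n", rt);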