CUDA Python Example
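A standalone SAXPY example built on NVIDIA's cuda-python bindings: the kernel source is compiled at run time with NVRTC and launched through the CUDA driver API. Running it requires a CUDA-capable GPU with a matching driver, plus the `cuda-python` and `numpy` packages.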
# type: ignore
import numpy as np
from cuda import cuda, nvrtc
def _cudaGetErrorEnum(error):
    if isinstance(error, cuda.CUresult):
        err, name = cuda.cuGetErrorName(error)
        return name if err == cuda.CUresult.CUDA_SUCCESS else "<unknown>"
    elif isinstance(error, nvrtc.nvrtcResult):
        return nvrtc.nvrtcGetErrorString(error)[1]
    else:
        raise RuntimeError("Unknown error type: {}".format(error))


def checkCudaErrors(result):
    if result[0].value:
        raise RuntimeError(
            "CUDA error code={}({})".format(result[0].value, _cudaGetErrorEnum(result[0]))
        )
    if len(result) == 1:
        return None
    elif len(result) == 2:
        return result[1]
    else:
        return result[1:]
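# Every cuda-python binding returns a tuple whose first element is the status
# code (CUresult or nvrtcResult) and whose remaining elements are the actual
# results. checkCudaErrors() raises on a non-zero status and unwraps the rest,
# e.g.:
#
#     device = checkCudaErrors(cuda.cuDeviceGet(0))  # (err, CUdevice) -> CUdevice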
saxpy = """\ | |
extern "C" __global__ | |
void saxpy(float a, float *x, float *y, float *out, size_t n) | |
{ | |
size_t tid = blockIdx.x * blockDim.x + threadIdx.x; | |
if (tid < n) { | |
out[tid] = a * x[tid] + y[tid]; | |
} | |
} | |
""" | |
# Initialize CUDA Driver API
checkCudaErrors(cuda.cuInit(0))

# Retrieve handle for device 0
cuDevice = checkCudaErrors(cuda.cuDeviceGet(0))

# Derive target architecture for device 0
major = checkCudaErrors(
    cuda.cuDeviceGetAttribute(
        cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice
    )
)
minor = checkCudaErrors(
    cuda.cuDeviceGetAttribute(
        cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice
    )
)
arch_arg = bytes(f"--gpu-architecture=compute_{major}{minor}", "ascii")
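# Example: on a device with compute capability 8.6 this yields
# b"--gpu-architecture=compute_86", so NVRTC emits PTX targeting that device.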
# Create program
prog = checkCudaErrors(nvrtc.nvrtcCreateProgram(str.encode(saxpy), b"saxpy.cu", 0, [], []))

# Compile program
opts = [b"--fmad=false", arch_arg]
checkCudaErrors(nvrtc.nvrtcCompileProgram(prog, 2, opts))
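# Optional diagnostic: dump the NVRTC compile log (warnings can survive a
# successful compile). A minimal sketch, assuming the log bindings mirror the
# C NVRTC API (nvrtcGetProgramLogSize / nvrtcGetProgramLog).
logSize = checkCudaErrors(nvrtc.nvrtcGetProgramLogSize(prog))
log = b" " * logSize
checkCudaErrors(nvrtc.nvrtcGetProgramLog(prog, log))
if logSize > 1:
    print(log.decode())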
# Get PTX from compilation
ptxSize = checkCudaErrors(nvrtc.nvrtcGetPTXSize(prog))
ptx = b" " * ptxSize
checkCudaErrors(nvrtc.nvrtcGetPTX(prog, ptx))
# Create context
context = checkCudaErrors(cuda.cuCtxCreate(0, cuDevice))

# Load PTX as module data and retrieve function
ptx = np.char.array(ptx)
# Note: Incompatible --gpu-architecture would be detected here
module = checkCudaErrors(cuda.cuModuleLoadData(ptx.ctypes.data))
kernel = checkCudaErrors(cuda.cuModuleGetFunction(module, b"saxpy"))
NUM_THREADS = 512  # Threads per block
NUM_BLOCKS = 32768  # Blocks per grid
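# Total work: 512 * 32768 = 16,777,216 elements, one thread per element;
# each float32 buffer below is therefore 64 MiB.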
a = np.array([2.0], dtype=np.float32)
# n must be 8 bytes wide to match the kernel's size_t parameter: cuLaunchKernel
# copies sizeof(size_t) bytes from the host buffer for this argument.
n = np.array(NUM_THREADS * NUM_BLOCKS, dtype=np.uint64)
bufferSize = n * a.itemsize

hX = np.random.rand(n).astype(dtype=np.float32)
hY = np.random.rand(n).astype(dtype=np.float32)
hOut = np.zeros(n).astype(dtype=np.float32)

dXclass = checkCudaErrors(cuda.cuMemAlloc(bufferSize))
dYclass = checkCudaErrors(cuda.cuMemAlloc(bufferSize))
dOutclass = checkCudaErrors(cuda.cuMemAlloc(bufferSize))

stream = checkCudaErrors(cuda.cuStreamCreate(0))
checkCudaErrors(cuda.cuMemcpyHtoDAsync(dXclass, hX.ctypes.data, bufferSize, stream))
checkCudaErrors(cuda.cuMemcpyHtoDAsync(dYclass, hY.ctypes.data, bufferSize, stream))
# The following code example is not intuitive
# Subject to change in a future release
dX = np.array([int(dXclass)], dtype=np.uint64)
dY = np.array([int(dYclass)], dtype=np.uint64)
dOut = np.array([int(dOutclass)], dtype=np.uint64)

args = [a, dX, dY, dOut, n]
args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
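# cuLaunchKernel expects kernelParams to be a pointer to an array of pointers,
# one per kernel argument, each pointing at that argument's value. The device
# pointers are first copied into uint64 host arrays (dX, dY, dOut) so that
# their addresses can be taken; args then holds the five argument addresses in
# the order the kernel declares them.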
checkCudaErrors(
    cuda.cuLaunchKernel(
        kernel,
        NUM_BLOCKS,  # grid x dim
        1,  # grid y dim
        1,  # grid z dim
        NUM_THREADS,  # block x dim
        1,  # block y dim
        1,  # block z dim
        0,  # dynamic shared memory
        stream,  # stream
        args.ctypes.data,  # kernel arguments
        0,  # extra (ignore)
    )
)
checkCudaErrors(cuda.cuMemcpyDtoHAsync(hOut.ctypes.data, dOutclass, bufferSize, stream))
checkCudaErrors(cuda.cuStreamSynchronize(stream))
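# The copies and the launch were all enqueued asynchronously on the same
# stream, so they execute in order; cuStreamSynchronize blocks until the
# device-to-host copy has finished and hOut is safe to read.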
# Assert values are same after running kernel
hZ = a * hX + hY
if not np.allclose(hOut, hZ):
    raise ValueError("Error outside tolerance for host-device vectors")

checkCudaErrors(cuda.cuStreamDestroy(stream))
checkCudaErrors(cuda.cuMemFree(dXclass))
checkCudaErrors(cuda.cuMemFree(dYclass))
checkCudaErrors(cuda.cuMemFree(dOutclass))
checkCudaErrors(cuda.cuModuleUnload(module))
checkCudaErrors(cuda.cuCtxDestroy(context))