Skip to content

Instantly share code, notes, and snippets.

@apowers313
Created October 10, 2024 14:19
Show Gist options
  • Save apowers313/d2b92ff2058de8cd42ecf29f4ed9cbe0 to your computer and use it in GitHub Desktop.
Save apowers313/d2b92ff2058de8cd42ecf29f4ed9cbe0 to your computer and use it in GitHub Desktop.
CUDA Python Simple Kernel With Argument
import ctypes
from typing import Any
import numpy as np
from cuda import cuda, cudart, nvrtc
cuda_code = """
extern "C" __global__ void simple(char *str) {
printf("this is a test\\n");
printf("passed argument was: %s\\n", str);
}
"""
str_arg = "hello from host"
prog_name = "simple"
grid = (1, 1, 1)
block = (1, 1, 1)
device_id = 0
# Error checking helper
def checkCudaErrors(result: tuple[Any, ...]) -> Any:
def _cudaGetErrorEnum(error: Any) -> Any:
if isinstance(error, cuda.CUresult):
err, name = cuda.cuGetErrorName(error)
return name if err == cuda.CUresult.CUDA_SUCCESS else "<unknown>"
elif isinstance(error, nvrtc.nvrtcResult):
return nvrtc.nvrtcGetErrorString(error)[1]
else:
raise RuntimeError("Unknown error type: {}".format(error))
if result[0].value:
raise RuntimeError(
"CUDA error code={}({})".format(result[0].value, _cudaGetErrorEnum(result[0]))
)
if len(result) == 1:
return None
elif len(result) == 2:
return result[1]
else:
return result[1:]
# Init CUDA
checkCudaErrors(cuda.cuInit(0))
# Create device
nv_device = checkCudaErrors(cuda.cuDeviceGet(device_id))
# Create context
nv_context = checkCudaErrors(cuda.cuCtxCreate(0, nv_device))
# Create stream
nv_stream = checkCudaErrors(cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_DEFAULT))
# Create program
nv_prog = checkCudaErrors(nvrtc.nvrtcCreateProgram(cuda_code.encode(), b"test.cu", 0, [], []))
# Compile code
compile_result = checkCudaErrors(nvrtc.nvrtcCompileProgram(nv_prog, 0, []))
# Get PTX from compilation
nv_ptx_size = checkCudaErrors(nvrtc.nvrtcGetPTXSize(nv_prog))
ptx = b" " * nv_ptx_size
checkCudaErrors(nvrtc.nvrtcGetPTX(nv_prog, ptx))
# Load PTX as module data
ptx = np.char.array(ptx)
ret = cuda.cuModuleLoadData(ptx.ctypes.data)
nv_module = checkCudaErrors(ret)
# Get kernel from module
nv_kernel = checkCudaErrors(cuda.cuModuleGetFunction(nv_module, prog_name.encode()))
# Create string argument
str_arg_buffer = bytearray(str_arg.encode())
str_arg_buffer.append(0) # trailing nul for C string, not really sure if this is necessary
str_arg_len = len(str_arg) + 1
# Allocate device memory
# TODO: cuMemAlloc causes cuLaunchKernel to fail with code=700(b'CUDA_ERROR_ILLEGAL_ADDRESS')
# nv_device_memory = checkCudaErrors(cuda.cuMemAlloc(str_arg_len))
nv_device_memory = checkCudaErrors(cudart.cudaMalloc(str_arg_len))
# Copy string from host to device
cuda.cuMemcpyHtoD(
nv_device_memory,
str_arg_buffer,
str_arg_len,
)
# Launch kernel
print("*** LAUNCHING KERNEL ***")
arg_data = [nv_device_memory]
arg_types = [ctypes.c_void_p]
nv_args = (tuple(arg_data), tuple(arg_types))
checkCudaErrors(
cuda.cuLaunchKernel(
nv_kernel,
grid[0], # grid x dim
grid[1], # grid y dim
grid[2], # grid z dim
block[0], # block x dim
block[1], # block y dim
block[2], # block z dim
0, # dynamic shared memory
nv_stream, # stream
nv_args, # kernel arguments
0, # extra (ignore)
)
)
# Synchronize with device before exiting
checkCudaErrors(cuda.cuStreamSynchronize(nv_stream))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment