Last active
October 13, 2024 14:39
-
-
Save apowers313/96428b4bae27161127b464b331a45070 to your computer and use it in GitHub Desktop.
Demonstration of Running a CUDA Python Graph Glitch (Bug?)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# type: ignore | |
import ctypes | |
from typing import Any | |
import numpy as np | |
from cuda import cuda, cudart, nvrtc | |
# Toggle for the demonstrated glitch: when True, the host-side argument
# buffer built inside mkgraph() is NOT kept alive after the function
# returns, so relaunching the returned graph later misbehaves.
do_bug = True
# Stash for Python objects whose memory CUDA graph nodes point at, to keep
# them from being garbage-collected while the graph is still launchable.
ref_keeper = dict()
# Error checking helper | |
def checkCudaErrors(result: tuple[Any, ...]) -> Any:
    """Unwrap a cuda-python call result, raising RuntimeError on failure.

    cuda-python binding calls return a tuple whose first element is a
    status enum and whose remaining elements are the actual return values.

    Returns None (status only), the single value, or the tuple of values.
    Raises RuntimeError with a decoded error name when the status is nonzero.
    """

    def _cudaGetErrorEnum(error: Any) -> Any:
        # Map a status enum to a human-readable name for the error message.
        if isinstance(error, cuda.CUresult):
            err, name = cuda.cuGetErrorName(error)
            return name if err == cuda.CUresult.CUDA_SUCCESS else "<unknown>"
        elif isinstance(error, cudart.cudaError_t):
            # Runtime-API calls below (cudaMalloc, cudaGraphAddMemcpyNode1D)
            # return cudaError_t; without this branch a failing cudart call
            # raised "Unknown error type" here instead of the real error name.
            return cudart.cudaGetErrorName(error)[1]
        elif isinstance(error, nvrtc.nvrtcResult):
            return nvrtc.nvrtcGetErrorString(error)[1]
        else:
            raise RuntimeError("Unknown error type: {}".format(error))

    if result[0].value:
        raise RuntimeError(
            "CUDA error code={}({})".format(result[0].value, _cudaGetErrorEnum(result[0]))
        )
    if len(result) == 1:
        return None
    elif len(result) == 2:
        return result[1]
    else:
        return result[1:]
def mkgraph():
    """Build, instantiate, and launch a single-kernel CUDA graph; return the graph.

    The graph has a host->device memcpy node feeding a kernel node that
    prints the copied string.  NOTE(review): the memcpy node records a
    pointer into the local bytearray ``str_arg_buffer``; unless ``do_bug``
    is False (which stashes the buffer in ``ref_keeper``), that buffer can
    be garbage-collected after this function returns, so re-instantiating
    and launching the returned graph from the caller reads stale host
    memory — this appears to be the glitch the gist demonstrates.
    """
    # Kernel source: prints a fixed message, the raw pointer value, and the
    # C string that pointer refers to.
    cuda_code = """
extern "C" __global__ void simple(char *str) {
printf("this is a test\\n");
printf("ptr: %p\\n", str);
printf("passed argument was: %s\\n", str);
}
"""
    str_arg = "hello from host"
    prog_name = "simple"
    grid = (1, 1, 1)    # one block
    block = (1, 1, 1)   # one thread
    device_id = 0
    # Init CUDA
    checkCudaErrors(cuda.cuInit(0))
    # Create device
    nv_device = checkCudaErrors(cuda.cuDeviceGet(device_id))
    # Create context
    nv_context = checkCudaErrors(cuda.cuCtxCreate(0, nv_device))
    # Create stream
    nv_stream = checkCudaErrors(cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_DEFAULT))
    # Create program (NVRTC runtime compilation of the kernel source)
    nv_prog = checkCudaErrors(nvrtc.nvrtcCreateProgram(cuda_code.encode(), b"test.cu", 0, [], []))
    # Compile code
    compile_result = checkCudaErrors(nvrtc.nvrtcCompileProgram(nv_prog, 0, []))
    # Get PTX from compilation
    nv_ptx_size = checkCudaErrors(nvrtc.nvrtcGetPTXSize(nv_prog))
    # Preallocated byte buffer that nvrtcGetPTX fills in
    ptx = b" " * nv_ptx_size
    checkCudaErrors(nvrtc.nvrtcGetPTX(nv_prog, ptx))
    # Load PTX as module data (np.char.array gives a stable data address)
    ptx = np.char.array(ptx)
    ret = cuda.cuModuleLoadData(ptx.ctypes.data)
    nv_module = checkCudaErrors(ret)
    # Get kernel from module
    nv_kernel = checkCudaErrors(cuda.cuModuleGetFunction(nv_module, prog_name.encode()))
    # Create graph
    nv_graph = checkCudaErrors(cuda.cuGraphCreate(0))
    # Create string argument (host-side source buffer for the memcpy node)
    str_arg_buffer = bytearray(str_arg.encode())
    str_arg_buffer.append(0)  # trailing nul for C string, not really sure if this is necessary
    str_arg_len = len(str_arg) + 1
    if not do_bug:
        # Keep the host buffer alive beyond this function so later launches
        # of the returned graph still copy valid memory; omitting this
        # (do_bug=True) is the behavior under investigation.
        ref_keeper["buf"] = str_arg_buffer
    # Allocate device memory
    # TODO: cuMemAlloc causes cuLaunchKernel to fail with code=700(b'CUDA_ERROR_ILLEGAL_ADDRESS')
    # nv_device_memory = checkCudaErrors(cuda.cuMemAlloc(str_arg_len))
    nv_device_memory = checkCudaErrors(cudart.cudaMalloc(str_arg_len))
    # Create memcpy node to copy string to device.  NOTE(review): the node
    # captures a pointer to str_arg_buffer's storage; the copy happens at
    # graph launch, not at graph-build time.
    memcpy_node = checkCudaErrors(
        cudart.cudaGraphAddMemcpyNode1D(
            nv_graph,
            None,
            0,
            nv_device_memory,
            str_arg_buffer,
            str_arg_len,
            cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
        )
    )
    # Create graph kernel node: single void* argument (device string pointer)
    arg_data = [nv_device_memory]
    arg_types = [ctypes.c_void_p]
    nv_args = (tuple(arg_data), tuple(arg_types))
    nv_kernel_node_params = cuda.CUDA_KERNEL_NODE_PARAMS()
    nv_kernel_node_params.func = nv_kernel
    nv_kernel_node_params.gridDimX = grid[0]
    nv_kernel_node_params.gridDimY = grid[1]
    nv_kernel_node_params.gridDimZ = grid[2]
    nv_kernel_node_params.blockDimX = block[0]
    nv_kernel_node_params.blockDimY = block[1]
    nv_kernel_node_params.blockDimZ = block[2]
    nv_kernel_node_params.sharedMemBytes = 0
    nv_kernel_node_params.kernelParams = nv_args
    kern_node = checkCudaErrors(cuda.cuGraphAddKernelNode(nv_graph, None, 0, nv_kernel_node_params))
    # Kernel node depends on memcpy node (copy must finish before the kernel runs)
    # checkCudaErrors(cudart.cudaGraphAddDependencies(nv_graph, [memcpy_node], [kern_node], 1))
    checkCudaErrors(cuda.cuGraphAddDependencies(nv_graph, [memcpy_node], [kern_node], 1))
    # Launch graph — first launch, while the host buffer is still alive
    print("*** LAUNCHING GRAPH IN FUNCTION ***")
    nv_graph_exec = checkCudaErrors(cuda.cuGraphInstantiate(nv_graph, 0))
    checkCudaErrors(cuda.cuGraphLaunch(nv_graph_exec, nv_stream))
    # Synchronize with device before exiting
    checkCudaErrors(cuda.cuStreamSynchronize(nv_stream))
    return nv_graph
# Build and launch the graph once inside the function; after this call the
# host-side argument buffer created in mkgraph() may be garbage-collected
# (unless do_bug was set to False, which stashes it in ref_keeper).
g = mkgraph()
# Create stream
nv_stream = checkCudaErrors(cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_DEFAULT))
# Launch graph
print("*** LAUNCHING GRAPH OUTSIDE FUNCTION ***")
# NOTE(review): instantiating again re-runs the memcpy node; with
# do_bug=True its host source pointer may refer to freed memory — this
# second launch is where the glitch shows up.
nv_graph_exec = checkCudaErrors(cuda.cuGraphInstantiate(g, 0))
checkCudaErrors(cuda.cuGraphLaunch(nv_graph_exec, nv_stream))
# Synchronize with device before exiting
checkCudaErrors(cuda.cuStreamSynchronize(nv_stream))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment