apowers313 · October 10, 2024 14:19
diff --git a/simple_kernel_with_arg.py b/simple_kernel_with_arg.py
 import ctypes
 from typing import Any

 import numpy as np
 from cuda import cuda, cudart, nvrtc

 cuda_code = """
 extern "C" __global__ void simple(char *str) {
    printf("this is a test\\n");
    printf("passed argument was: %s\\n", str);
 }
 """
 str_arg = "hello from host"
 prog_name = "simple"
 grid = (1, 1, 1)
 block = (1, 1, 1)
 device_id = 0


 # Error checking helper
 def checkCudaErrors(result: tuple[Any, ...]) -> Any:
    def _cudaGetErrorEnum(error: Any) -> Any:
        if isinstance(error, cuda.CUresult):
            err, name = cuda.cuGetErrorName(error)
            return name if err == cuda.CUresult.CUDA_SUCCESS else "<unknown>"
        elif isinstance(error, nvrtc.nvrtcResult):
            return nvrtc.nvrtcGetErrorString(error)[1]
        else:
            raise RuntimeError("Unknown error type: {}".format(error))

    if result[0].value:
        raise RuntimeError(
            "CUDA error code={}({})".format(result[0].value, _cudaGetErrorEnum(result[0]))
        )
    if len(result) == 1:
        return None
    elif len(result) == 2:
        return result[1]
    else:
        return result[1:]


 # Init CUDA
 checkCudaErrors(cuda.cuInit(0))

 # Create device
 nv_device = checkCudaErrors(cuda.cuDeviceGet(device_id))

 # Create context
 nv_context = checkCudaErrors(cuda.cuCtxCreate(0, nv_device))

 # Create stream
 nv_stream = checkCudaErrors(cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_DEFAULT))

 # Create program
 nv_prog = checkCudaErrors(nvrtc.nvrtcCreateProgram(cuda_code.encode(), b"test.cu", 0, [], []))

 # Compile code
 compile_result = checkCudaErrors(nvrtc.nvrtcCompileProgram(nv_prog, 0, []))

 # Get PTX from compilation
 nv_ptx_size = checkCudaErrors(nvrtc.nvrtcGetPTXSize(nv_prog))
 ptx = b" " * nv_ptx_size
 checkCudaErrors(nvrtc.nvrtcGetPTX(nv_prog, ptx))

 # Load PTX as module data
 ptx = np.char.array(ptx)
 ret = cuda.cuModuleLoadData(ptx.ctypes.data)
 nv_module = checkCudaErrors(ret)

 # Get kernel from module
 nv_kernel = checkCudaErrors(cuda.cuModuleGetFunction(nv_module, prog_name.encode()))

 # Create string argument
 str_arg_buffer = bytearray(str_arg.encode())
 str_arg_buffer.append(0)  # trailing nul for C string, not really sure if this is necessary
 str_arg_len = len(str_arg) + 1

 # Allocate device memory
 # TODO: cuMemAlloc causes cuLaunchKernel to fail with code=700(b'CUDA_ERROR_ILLEGAL_ADDRESS')
 # nv_device_memory = checkCudaErrors(cuda.cuMemAlloc(str_arg_len))
 nv_device_memory = checkCudaErrors(cudart.cudaMalloc(str_arg_len))

 # Copy string from host to device
 cuda.cuMemcpyHtoD(
    nv_device_memory,
    str_arg_buffer,
    str_arg_len,
 )

 # Launch kernel
 print("*** LAUNCHING KERNEL ***")
 arg_data = [nv_device_memory]
 arg_types = [ctypes.c_void_p]
 nv_args = (tuple(arg_data), tuple(arg_types))
 checkCudaErrors(
    cuda.cuLaunchKernel(
        nv_kernel,
        grid[0],  # grid x dim
        grid[1],  # grid y dim
        grid[2],  # grid z dim
        block[0],  # block x dim
        block[1],  # block y dim
        block[2],  # block z dim
        0,  # dynamic shared memory
        nv_stream,  # stream
        nv_args,  # kernel arguments
        0,  # extra (ignore)
    )
 )

 # Synchronize with device before exiting
 checkCudaErrors(cuda.cuStreamSynchronize(nv_stream))
	import ctypes
	from typing import Any

	import numpy as np
	from cuda import cuda, cudart, nvrtc

	cuda_code = """
	extern "C" __global__ void simple(char *str) {
	printf("this is a test\\n");
	printf("passed argument was: %s\\n", str);
	}
	"""
	str_arg = "hello from host"
	prog_name = "simple"
	grid = (1, 1, 1)
	block = (1, 1, 1)
	device_id = 0


	# Error checking helper
	def checkCudaErrors(result: tuple[Any, ...]) -> Any:
	def _cudaGetErrorEnum(error: Any) -> Any:
	if isinstance(error, cuda.CUresult):
	err, name = cuda.cuGetErrorName(error)
	return name if err == cuda.CUresult.CUDA_SUCCESS else "<unknown>"
	elif isinstance(error, nvrtc.nvrtcResult):
	return nvrtc.nvrtcGetErrorString(error)[1]
	else:
	raise RuntimeError("Unknown error type: {}".format(error))

	if result[0].value:
	raise RuntimeError(
	"CUDA error code={}({})".format(result[0].value, _cudaGetErrorEnum(result[0]))
	)
	if len(result) == 1:
	return None
	elif len(result) == 2:
	return result[1]
	else:
	return result[1:]


	# Init CUDA
	checkCudaErrors(cuda.cuInit(0))

	# Create device
	nv_device = checkCudaErrors(cuda.cuDeviceGet(device_id))

	# Create context
	nv_context = checkCudaErrors(cuda.cuCtxCreate(0, nv_device))

	# Create stream
	nv_stream = checkCudaErrors(cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_DEFAULT))

	# Create program
	nv_prog = checkCudaErrors(nvrtc.nvrtcCreateProgram(cuda_code.encode(), b"test.cu", 0, [], []))

	# Compile code
	compile_result = checkCudaErrors(nvrtc.nvrtcCompileProgram(nv_prog, 0, []))

	# Get PTX from compilation
	nv_ptx_size = checkCudaErrors(nvrtc.nvrtcGetPTXSize(nv_prog))
	ptx = b" " * nv_ptx_size
	checkCudaErrors(nvrtc.nvrtcGetPTX(nv_prog, ptx))

	# Load PTX as module data
	ptx = np.char.array(ptx)
	ret = cuda.cuModuleLoadData(ptx.ctypes.data)
	nv_module = checkCudaErrors(ret)

	# Get kernel from module
	nv_kernel = checkCudaErrors(cuda.cuModuleGetFunction(nv_module, prog_name.encode()))

	# Create string argument
	str_arg_buffer = bytearray(str_arg.encode())
	str_arg_buffer.append(0) # trailing nul for C string, not really sure if this is necessary
	str_arg_len = len(str_arg) + 1

	# Allocate device memory
	# TODO: cuMemAlloc causes cuLaunchKernel to fail with code=700(b'CUDA_ERROR_ILLEGAL_ADDRESS')
	# nv_device_memory = checkCudaErrors(cuda.cuMemAlloc(str_arg_len))
	nv_device_memory = checkCudaErrors(cudart.cudaMalloc(str_arg_len))

	# Copy string from host to device
	cuda.cuMemcpyHtoD(
	nv_device_memory,
	str_arg_buffer,
	str_arg_len,
	)

	# Launch kernel
	print("* LAUNCHING KERNEL *")
	arg_data = [nv_device_memory]
	arg_types = [ctypes.c_void_p]
	nv_args = (tuple(arg_data), tuple(arg_types))
	checkCudaErrors(
	cuda.cuLaunchKernel(
	nv_kernel,
	grid[0], # grid x dim
	grid[1], # grid y dim
	grid[2], # grid z dim
	block[0], # block x dim
	block[1], # block y dim
	block[2], # block z dim
	0, # dynamic shared memory
	nv_stream, # stream
	nv_args, # kernel arguments
	0, # extra (ignore)
	)
	)

	# Synchronize with device before exiting
	checkCudaErrors(cuda.cuStreamSynchronize(nv_stream))