@ConsciousMachines · Created June 24, 2024
Process NumPy array with CUDA
# send NumPy array data to CUDA code
# =============================================================================
# =============================================================================
# MAIN LESSONS:
# 1. python natively provides the "buffer protocol", an interface to a C array
# 2. numpy supports it and exposes a pointer to the C array data
#    (see the short sketch right after these notes)
# 3. use ctypes as the native interface to C libraries, and send them the ptr
# 4. heavier binding tools (SWIG, Cython wrappers, etc.) tend to break with
#    updates, while ctypes ships with the standard library
# 5. for cuda, compile the code as an extern "C" shared object
# note: ctypes caches the loaded library, so restart python each time the .so
# changes in order to reimport it
# https://stackoverflow.com/questions/145270/calling-c-c-from-python
# https://stackoverflow.com/questions/64084033/modern-2020-way-to-call-c-code-from-python
# plain C equivalent: gcc -fPIC -shared -o mult.so mult.c
# -fPIC means position independent code, which shared libraries need because
# the code cannot depend on the memory address where it is loaded
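# a minimal illustration of lessons 1-2 (an added sketch, not part of the
# original workflow): memoryview() is the pure-python face of the buffer
# protocol, and ndarray.ctypes exposes the raw data pointer we later hand to C
import numpy as np
demo = np.arange(3, dtype=np.float32)
mv = memoryview(demo)          # buffer-protocol view over the same memory
assert mv.format == 'f'        # 'f' = C float, matching float32
assert mv.contiguous           # one unbroken C array, safe to pass as a ptr
demo.ctypes.data               # integer address of that C array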
import os # to change directories
import subprocess # to compile from python side
import ctypes # to access shared obj library
import numpy as np # to use numpy arrays
# change directory to where the files are located
work_dir = '/home/chad/Desktop/_backups/notes/projects/numpy_2_cuda'
os.chdir(work_dir)
os.getcwd()
# PART 1: compile CUDA code
# =============================================================================
# =============================================================================
c_source_filename = 'mult.cu'
shared_obj_filename = c_source_filename.replace('.cu', '.so')
# define the CUDA source: a kernel that doubles each element, plus a C wrapper
c_source_contents = '''
#include <cstdint>

// kernel: each thread doubles one element of x
__global__ void kernel(float *x, int32_t n)
{
    int32_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        x[i] *= 2.0f;
}

// extern "C" prevents C++ name mangling, so ctypes can find "mult" by name
extern "C" void mult(float* x, int32_t n)
{
    float *d_x;
    cudaMalloc(&d_x, n*sizeof(float));
    cudaMemcpy(d_x, x, n*sizeof(float), cudaMemcpyHostToDevice);
    // ceil(n/256) blocks of 256 threads, so every element gets a thread
    kernel<<<(n+255)/256, 256>>>(d_x, n);
    cudaMemcpy(x, d_x, n*sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(d_x);
}
'''
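# quick arithmetic check of the launch geometry above (illustrative, added for
# this note): ceil division (n+255)//256 always yields just enough 256-thread
# blocks to cover n elements, which is why the kernel needs the i < n guard
for n in (1, 5, 256, 257, 1000):
    blocks = (n + 255) // 256
    assert blocks * 256 >= n and (blocks - 1) * 256 < n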
# save CUDA source contents to disk
with open(os.path.join(work_dir, c_source_filename), 'w') as f:
_ = f.write(c_source_contents)
# compile
# https://forums.developer.nvidia.com/t/shared-library-creation/4776/10
if os.path.exists(shared_obj_filename):  # avoid FileNotFoundError on first run
    os.remove(shared_obj_filename)
compile_command = '/usr/local/cuda/bin/nvcc --shared --compiler-options -fPIC mult.cu -o mult.so'
result = subprocess.run(compile_command.split(' '), capture_output=True, text=True)
assert result.returncode == 0, result.stderr  # nvcc prints errors to stderr
result
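# optional sanity check (an added sketch, assuming binutils' nm is available):
# confirm extern "C" kept the symbol unmangled so ctypes can look it up by name
nm_out = subprocess.run(['nm', '-D', shared_obj_filename],
                        capture_output=True, text=True)
assert ' T mult' in nm_out.stdout  # 'T' marks a defined code symbol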
# PART 2: use numpy array with CUDA code
# =============================================================================
# =============================================================================
np_array = np.array([1, 2, 3, 4, 5], dtype=np.float32)
# https://numpy.org/doc/stable/reference/generated/numpy.ndarray.ctypes.html
# the raw-pointer approach requires aligned, C-contiguous memory (no strided views)
assert np_array.flags['ALIGNED']
assert np_array.flags['C_CONTIGUOUS']
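# hedged aside (not in the original gist): a strided view fails the contiguity
# check; np.ascontiguousarray returns a compacted copy that passes it
col_view = np.zeros((4, 4), dtype=np.float32)[:, 0]  # column view, strided
assert not col_view.flags['C_CONTIGUOUS']
assert np.ascontiguousarray(col_view).flags['C_CONTIGUOUS']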
# load shared library, specify argument / return types
# (c_void_p accepts the plain integer address that ndarray.ctypes.data returns)
my_lib = ctypes.CDLL('./mult.so')
my_lib.mult.argtypes = (ctypes.c_void_p, ctypes.c_int)
my_lib.mult.restype = None
# call the function with the pointer to the NumPy array data;
# the kernel doubles every element in place
np_array  # before: [1. 2. 3. 4. 5.]
my_lib.mult(np_array.ctypes.data, np_array.size)
np_array  # after:  [2. 4. 6. 8. 10.]
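# verify on the python side that every element was doubled
assert np.allclose(np_array, [2, 4, 6, 8, 10])

# alternative binding (an added sketch, not from the original gist):
# numpy.ctypeslib.ndpointer makes ctypes check dtype and contiguity at call
# time, so you can pass the array object itself instead of the raw address
from numpy.ctypeslib import ndpointer
my_lib.mult.argtypes = (ndpointer(dtype=np.float32, flags='C_CONTIGUOUS'),
                        ctypes.c_int)
my_lib.mult(np_array, np_array.size)   # doubles in place again
assert np.allclose(np_array, [4, 8, 12, 16, 20])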