Created
June 24, 2024 19:22
-
-
Save ConsciousMachines/abd4b9fe733c5d84e6301bee1fd7dfd4 to your computer and use it in GitHub Desktop.
Process NumPy array with CUDA
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Send NumPy array data to CUDA code.
#
# MAIN LESSONS:
#   1. Python natively provides the "buffer protocol", an interface to a C array.
#   2. NumPy supports this and exposes a pointer to the underlying C array data.
#   3. Use ctypes as the native interface to C libraries and hand them the pointer.
#   4. No point using other tools, as they all break with updates.
#   5. For CUDA, compile the kernel as an extern "C" shared object.
#      (Restart Python each time the .so changes so ctypes re-imports it.)
#
# References:
#   https://stackoverflow.com/questions/145270/calling-c-c-from-python
#   https://stackoverflow.com/questions/64084033/modern-2020-way-to-call-c-code-from-python
#
# Plain-C equivalent build:  gcc -fPIC -shared -o mult.so mult.c
#   -fPIC emits position-independent code, required for shared libraries
#   because the code must not depend on the address it is loaded at.

import os          # to change directories
import subprocess  # to compile from the Python side
import ctypes      # to access the shared-object library
import numpy as np # to use NumPy arrays

# Work out of the directory that holds the CUDA source and the built .so.
work_dir = '/home/chad/Desktop/_backups/notes/projects/numpy_2_cuda'
os.chdir(work_dir)
os.getcwd()
# PART 1: compile the CUDA code
# =============================================================================
# =============================================================================
c_source_filename = 'mult.cu'
shared_obj_filename = c_source_filename.replace('.cu', '.so')

# CUDA source: a kernel that doubles every element in place, plus an
# extern "C" wrapper (unmangled symbol) that ctypes can look up by name.
c_source_contents = '''
__global__ void kernel(float *x, int32_t n)
{
    int32_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        x[i] *= 2.0f;
}

extern "C" void mult(float* x, int32_t n)
{
    float *d_x;
    cudaMalloc(&d_x, n*sizeof(float));
    cudaMemcpy(d_x, x, n*sizeof(float), cudaMemcpyHostToDevice);
    kernel<<<(n+255)/256, 256>>>(d_x, n);
    cudaMemcpy(x, d_x, n*sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(d_x);
}
'''

# Save the CUDA source to disk.
with open(os.path.join(work_dir, c_source_filename), 'w') as f:
    _ = f.write(c_source_contents)

# Remove any stale build product. On a first run there is nothing to remove,
# so guard the call (a bare os.remove would raise FileNotFoundError here).
if os.path.exists(shared_obj_filename):
    os.remove(shared_obj_filename)

# Compile to a shared object with nvcc.
# https://forums.developer.nvidia.com/t/shared-library-creation/4776/10
# --compiler-options -fPIC forwards -fPIC to the host compiler; the original
# command passed both --shared and -shared — once is enough.
compile_command = '/usr/local/cuda/bin/nvcc --shared --compiler-options -fPIC mult.cu -o mult.so'
result = subprocess.run(compile_command.split(' '), capture_output=True, text=True)

# Fail loudly on compiler errors, with the diagnostics attached.
# (An `assert` would be silently stripped under `python -O`.)
if result.stderr != '':
    raise RuntimeError(f'nvcc failed:\n{result.stderr}')
result
# PART 2: use a NumPy array with the CUDA code
# =============================================================================
# =============================================================================
np_array = np.array([1, 2, 3, 4, 5], dtype=np.float32)

# The library mutates the buffer in place, so the array must own aligned,
# C-contiguous memory.
# https://numpy.org/doc/stable/reference/generated/numpy.ndarray.ctypes.html
assert np_array.flags['ALIGNED']
assert np_array.flags['C_CONTIGUOUS']

# Load the shared library and declare the C signature:
#     void mult(float* x, int32_t n)
# ndpointer lets us pass the ndarray itself — ctypes extracts the data pointer
# and rejects arrays of the wrong dtype/layout, instead of accepting any raw
# integer via c_void_p. c_int32 matches int32_t exactly (plain c_int is only
# int-sized by platform convention).
my_lib = ctypes.CDLL('./mult.so')
my_lib.mult.argtypes = (
    np.ctypeslib.ndpointer(dtype=np.float32, ndim=1, flags='C_CONTIGUOUS'),
    ctypes.c_int32,
)
my_lib.mult.restype = None

# Call the function; np_array is modified in place (each element doubled).
np_array
my_lib.mult(np_array, np_array.size)
np_array
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment