@ConsciousMachines · Created June 24, 2024
Process NumPy array with CUDA
# send NumPy array data to CUDA code
# =============================================================================
# =============================================================================
# MAIN LESSONS:
# 1. python natively provides the "buffer protocol", an interface to a C array
# 2. numpy supports it and exposes a pointer to the C array data
#    (see the short sketch right after these notes)
# 3. use ctypes as the native interface to C libraries, and send them the ptr
# 4. heavier binding tools (SWIG, Cython wrappers, etc.) tend to break with
#    updates, while ctypes ships with the standard library
# 5. for cuda, compile the code as an extern "C" shared object
# note: ctypes caches the loaded library, so restart python each time the .so
# changes in order to reimport it
# https://stackoverflow.com/questions/145270/calling-c-c-from-python
# https://stackoverflow.com/questions/64084033/modern-2020-way-to-call-c-code-from-python
# plain C equivalent: gcc -fPIC -shared -o mult.so mult.c
# -fPIC means position independent code, which shared libraries need because
# the code cannot depend on the memory address where it is loaded
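# a minimal illustration of lessons 1-2 (an added sketch, not part of the
# original workflow): memoryview() is the pure-python face of the buffer
# protocol, and ndarray.ctypes exposes the raw data pointer we later hand to C
import numpy as np
demo = np.arange(3, dtype=np.float32)
mv = memoryview(demo)          # buffer-protocol view over the same memory
assert mv.format == 'f'        # 'f' = C float, matching float32
assert mv.contiguous           # one unbroken C array, safe to pass as a ptr
demo.ctypes.data               # integer address of that C array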
import os # to change directories
import subprocess # to compile from python side
import ctypes # to access shared obj library
import numpy as np # to use numpy arrays
# change directory to where the files are located
work_dir = '/home/chad/Desktop/_backups/notes/projects/numpy_2_cuda'
os.chdir(work_dir)
os.getcwd()
# PART 1: compile CUDA code
# =============================================================================
# =============================================================================
c_source_filename = 'mult.cu'
shared_obj_filename = c_source_filename.replace('.cu', '.so')
# define the CUDA source: a kernel that doubles each element, plus a C wrapper
c_source_contents = '''
#include <cstdint>

// kernel: each thread doubles one element of x
__global__ void kernel(float *x, int32_t n)
{
    int32_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        x[i] *= 2.0f;
}

// extern "C" prevents C++ name mangling, so ctypes can find "mult" by name
extern "C" void mult(float* x, int32_t n)
{
    float *d_x;
    cudaMalloc(&d_x, n*sizeof(float));
    cudaMemcpy(d_x, x, n*sizeof(float), cudaMemcpyHostToDevice);
    // ceil(n/256) blocks of 256 threads, so every element gets a thread
    kernel<<<(n+255)/256, 256>>>(d_x, n);
    cudaMemcpy(x, d_x, n*sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(d_x);
}
'''
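# quick arithmetic check of the launch geometry above (illustrative, added for
# this note): ceil division (n+255)//256 always yields just enough 256-thread
# blocks to cover n elements, which is why the kernel needs the i < n guard
for n in (1, 5, 256, 257, 1000):
    blocks = (n + 255) // 256
    assert blocks * 256 >= n and (blocks - 1) * 256 < n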
# save CUDA source contents to disk
with open(os.path.join(work_dir, c_source_filename), 'w') as f:
_ = f.write(c_source_contents)
# compile
# https://forums.developer.nvidia.com/t/shared-library-creation/4776/10
if os.path.exists(shared_obj_filename):  # avoid FileNotFoundError on first run
    os.remove(shared_obj_filename)
compile_command = '/usr/local/cuda/bin/nvcc --shared --compiler-options -fPIC mult.cu -o mult.so'
result = subprocess.run(compile_command.split(' '), capture_output=True, text=True)
assert result.returncode == 0, result.stderr  # nvcc prints errors to stderr
result
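# optional sanity check (an added sketch, assuming binutils' nm is available):
# confirm extern "C" kept the symbol unmangled so ctypes can look it up by name
nm_out = subprocess.run(['nm', '-D', shared_obj_filename],
                        capture_output=True, text=True)
assert ' T mult' in nm_out.stdout  # 'T' marks a defined code symbol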
# PART 2: use numpy array with CUDA code
# =============================================================================
# =============================================================================
np_array = np.array([1, 2, 3, 4, 5], dtype=np.float32)
# https://numpy.org/doc/stable/reference/generated/numpy.ndarray.ctypes.html
# the raw-pointer approach requires aligned, C-contiguous memory (no strided views)
assert np_array.flags['ALIGNED']
assert np_array.flags['C_CONTIGUOUS']
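# hedged aside (not in the original gist): a strided view fails the contiguity
# check; np.ascontiguousarray returns a compacted copy that passes it
col_view = np.zeros((4, 4), dtype=np.float32)[:, 0]  # column view, strided
assert not col_view.flags['C_CONTIGUOUS']
assert np.ascontiguousarray(col_view).flags['C_CONTIGUOUS']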
# load shared library, specify argument / return types
# (c_void_p accepts the plain integer address that ndarray.ctypes.data returns)
my_lib = ctypes.CDLL('./mult.so')
my_lib.mult.argtypes = (ctypes.c_void_p, ctypes.c_int)
my_lib.mult.restype = None
# call the function with the pointer to the NumPy array data;
# the kernel doubles every element in place
np_array  # before: [1. 2. 3. 4. 5.]
my_lib.mult(np_array.ctypes.data, np_array.size)
np_array  # after:  [2. 4. 6. 8. 10.]
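# verify on the python side that every element was doubled
assert np.allclose(np_array, [2, 4, 6, 8, 10])

# alternative binding (an added sketch, not from the original gist):
# numpy.ctypeslib.ndpointer makes ctypes check dtype and contiguity at call
# time, so you can pass the array object itself instead of the raw address
from numpy.ctypeslib import ndpointer
my_lib.mult.argtypes = (ndpointer(dtype=np.float32, flags='C_CONTIGUOUS'),
                        ctypes.c_int)
my_lib.mult(np_array, np_array.size)   # doubles in place again
assert np.allclose(np_array, [4, 8, 12, 16, 20])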