machinaut · July 26, 2021 23:08
diff --git a/bench.ipynb b/bench.ipynb
diff --git a/Makefile b/Makefile
 # Root of the CUDA toolkit installation; override with `make CUDA_PATH=...`.
 CUDA_PATH ?= /usr/local/cuda

 .PHONY: clean

 # Link the object file into a shared library.
 # This is the first rule, so it is the default goal.
 vadd.so: vadd.o
 	nvcc -shared $^ -o $@ -lcuda

 # Compile the CUDA source into position-independent object code for sm_70.
 # -c only selects compile-without-link; the output file must be named with -o,
 # otherwise the target path is handed to nvcc as an extra input file.
 vadd.o: vadd.cu
 	nvcc -I $(CUDA_PATH)/include -I$(CUDA_PATH)/samples/common/inc -arch=sm_70 --compiler-options '-fPIC' -c $< -o $@

 # Remove every object file and shared library in the current directory.
 clean:
 	rm -f *.o *.so
diff --git a/vadd.cu b/vadd.cu
 // For the CUDA runtime routines (prefixed with "cuda")
 // #include <cuda.h>
 #include <cuda_runtime.h>

 namespace
 {
    // Element-wise vector addition kernel: the thread with global index idx
    // writes C[idx] = A[idx] + B[idx]. Threads whose index is at or beyond n
    // exit without touching memory.
    __global__ void _vadd(const float *A, const float *B, float *C, int n)
    {
        const int idx = threadIdx.x + blockIdx.x * blockDim.x;
        if (idx >= n)
        {
            return;
        }
        C[idx] = A[idx] + B[idx];
    }
 }

 // Host entry point with C linkage: launches the _vadd kernel so that
 // C[i] = A[i] + B[i] for 0 <= i < n.
 //
 //   A, B     input vectors  (assumed to be device-accessible pointers to at
 //   C        output vector   least n floats each -- confirm against callers)
 //   n        number of elements to add
 //   threads  number of threads per block
 //
 // Returns without launching anything when n <= 0 or threads <= 0.
 extern "C" void vadd(const float *A, const float *B, float *C, int n, int threads)
 {
    // An empty vector needs no launch, and a zero block size would divide by
    // zero in the grid-size computation below.
    if (n <= 0 || threads <= 0)
    {
        return;
    }
    // Ceiling of n / threads, written so that no intermediate exceeds n and
    // the sum cannot overflow int (unlike n + threads - 1).
    const int blocks = (n - 1) / threads + 1;
    _vadd<<<blocks, threads>>>(A, B, C, n);
    // NOTE(review): the launch status is not checked here and the function
    // returns void; callers that need it can query cudaGetLastError().
 }
	# Root of the CUDA toolkit installation; override with `make CUDA_PATH=...`.
	CUDA_PATH ?= /usr/local/cuda

	.PHONY: clean

	# Link the object file into a shared library.
	vadd.so: vadd.o
		nvcc -shared $^ -o $@ -lcuda

	# Compile the CUDA source into position-independent object code for sm_70.
	# -c only selects compile-without-link; the output file is named with -o.
	vadd.o: vadd.cu
		nvcc -I $(CUDA_PATH)/include -I$(CUDA_PATH)/samples/common/inc -arch=sm_70 --compiler-options '-fPIC' -c $< -o $@

	# Remove every object file and shared library in the current directory.
	clean:
		rm -f *.o *.so
	// For the CUDA runtime routines (prefixed with "cuda")
	// #include <cuda.h>
	#include <cuda_runtime.h>

	namespace
	{
	    // Element-wise vector addition kernel: the thread with global index i
	    // writes C[i] = A[i] + B[i]; threads with i >= n do nothing.
	    __global__ void _vadd(const float *A, const float *B, float *C, int n)
	    {
	        int i = blockDim.x * blockIdx.x + threadIdx.x;
	        if (i < n)
	        {
	            C[i] = A[i] + B[i];
	        }
	    }
	}

	// Host entry point with C linkage: launches the _vadd kernel so that
	// C[i] = A[i] + B[i] for 0 <= i < n.
	//
	//   A, B     input vectors  (assumed to be device-accessible pointers to at
	//   C        output vector   least n floats each -- confirm against callers)
	//   n        number of elements to add
	//   threads  number of threads per block
	//
	// Returns without launching anything when n <= 0 or threads <= 0.
	extern "C" void vadd(const float *A, const float *B, float *C, int n, int threads)
	{
	    // An empty vector needs no launch, and a zero block size would divide by
	    // zero in the grid-size computation below.
	    if (n <= 0 || threads <= 0)
	    {
	        return;
	    }
	    // Ceiling of n / threads, written so the sum cannot overflow int.
	    const int blocks = (n - 1) / threads + 1;
	    _vadd<<<blocks, threads>>>(A, B, C, n);
	    // NOTE(review): the launch status is not checked here and the function
	    // returns void; callers that need it can query cudaGetLastError().
	}