Created
July 26, 2021 23:08
-
-
Save machinaut/30b365d31abb4941fc838e0acb9e5db3 to your computer and use it in GitHub Desktop.
Trying a bare CUDA vector add against PyTorch and Triton
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Builds vadd.so, a shared library wrapping the CUDA vector-add kernel in vadd.cu.
#
# Overridable variables:
#   CUDA_PATH - CUDA toolkit root, used for the include directories.
#   NVCC      - CUDA compiler driver to invoke.
#   ARCH      - value passed to nvcc's -arch (sm_70 by default, as before).
CUDA_PATH ?= /usr/local/cuda
NVCC ?= nvcc
ARCH ?= sm_70

.PHONY: clean

# Link the position-independent object into a shared library.
vadd.so: vadd.o
	$(NVCC) -shared $^ -o $@ -lcuda

# Compile only (-c) and name the output explicitly with -o; the previous
# recipe passed the target as a second input file instead of as the output.
vadd.o: vadd.cu
	$(NVCC) -I $(CUDA_PATH)/include -I$(CUDA_PATH)/samples/common/inc -arch=$(ARCH) --compiler-options '-fPIC' -c $< -o $@

clean:
	rm -f *.o *.so
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cstdio>

// For the CUDA runtime routines (prefixed with "cuda_")
// #include <cuda.h>
#include <cuda_runtime.h>
namespace
{
    /**
     * Element-wise float addition: C[i] = A[i] + B[i] for every i in [0, n).
     *
     * Launch layout: only the x dimension of the grid and block is used.
     * Each thread starts at its flat global index and advances by the total
     * number of launched threads, so any grid size covers all n elements;
     * with at least n threads each thread handles at most one element.
     *
     * The index is kept in 64 bits: a 32-bit index narrowed to int can wrap
     * negative for tail threads when n is close to INT_MAX, which would pass
     * an `i < n` guard and write out of bounds.
     *
     * The pointers are deliberately not marked __restrict__ so that C may
     * alias A or B (in-place add).
     *
     * @param A  first input array, at least n floats
     * @param B  second input array, at least n floats
     * @param C  output array, at least n floats
     * @param n  number of elements; nothing is written when n <= 0
     */
    __global__ void _vadd(const float *A, const float *B, float *C, int n)
    {
        const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;
        const long long first = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
        for (long long i = first; i < n; i += stride)
        {
            C[i] = A[i] + B[i];
        }
    }
}
/**
 * C-callable entry point: launches the _vadd kernel to compute
 * C[i] = A[i] + B[i] for i in [0, n).
 *
 * The launch goes to the default stream and this function does not
 * synchronize, so it may return before the kernel has finished.
 *
 * @param A        first input array (dereferenced on the device)
 * @param B        second input array (dereferenced on the device)
 * @param C        output array (dereferenced on the device)
 * @param n        number of elements; n <= 0 is treated as a no-op
 * @param threads  threads per block; must be positive
 *
 * Because the signature returns void, problems are reported on stderr.
 */
extern "C" void vadd(const float *A, const float *B, float *C, int n, int threads)
{
    // Nothing to add; a zero-block grid would be an invalid launch configuration.
    if (n <= 0)
    {
        return;
    }
    // A non-positive block size would divide by zero (or go negative) below.
    if (threads <= 0)
    {
        fprintf(stderr, "vadd: threads per block must be positive, got %d\n", threads);
        return;
    }

    // Ceil-divide in 64 bits: n + threads - 1 can overflow int when n is near INT_MAX.
    // The quotient never exceeds n, so it always fits back into an int.
    const long long wide_n = n;
    const int blocks = static_cast<int>((wide_n + threads - 1) / threads);

    _vadd<<<blocks, threads>>>(A, B, C, n);

    // Peek rather than get: surface the failure here without clearing the
    // error state that a caller may inspect afterwards.
    const cudaError_t err = cudaPeekAtLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "vadd: CUDA error after kernel launch: %s\n", cudaGetErrorString(err));
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment