Created
January 27, 2023 11:49
-
-
Save Ediolot/1ea9c05e999dfdc3ccb24f6a97d96bce to your computer and use it in GitHub Desktop.
CUDA Vector add example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cstddef>
#include <cstdint>
#include <iostream>
/**
 * Element-wise vector addition: C[i] = A[i] + B[i] for every i in [0, N).
 *
 * Launch layout: expects a 1D grid of 1D blocks; only the x dimensions of
 * gridDim/blockDim/blockIdx/threadIdx are used. The kernel is a grid-stride
 * loop, so it produces all N outputs for any grid and block size (including
 * a single thread) and does nothing when N == 0.
 *
 * No shared memory is used.
 *
 * @param A  device pointer to the first input vector (at least N floats)
 * @param B  device pointer to the second input vector (at least N floats)
 * @param C  device pointer to the output vector (at least N floats)
 * @param N  number of elements to process
 */
__global__ void vectorAdd(const float *A, const float *B, float *C, uint32_t N) {
    // Index math is done in 64 bits: gridDim.x * blockDim.x (and the running
    // index plus the stride) can exceed the range of a 32-bit unsigned value,
    // which would wrap around and revisit or skip elements.
    const uint64_t stride = static_cast<uint64_t>(gridDim.x) * blockDim.x;
    const uint64_t first = static_cast<uint64_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    for (uint64_t i = first; i < N; i += stride) {
        C[i] = A[i] + B[i];
    }
}
/**
 * Host driver for the vector-add example.
 *
 * Fills two host vectors with 1.0f and 2.0f, copies them to the device, runs
 * vectorAdd, copies the result back and verifies every element equals 3.0f.
 *
 * @return 0 when every CUDA call succeeded and the result verified,
 *         1 otherwise.
 */
int main() {
    const uint32_t N = 1000000000;
    const size_t bytes = static_cast<size_t>(N) * sizeof(float);

    // Reports a failed CUDA runtime call on stderr. Returns true on success so
    // that calls can be chained with && and stop at the first failure.
    auto check = [](cudaError_t err, const char *what) -> bool {
        if (err == cudaSuccess) {
            return true;
        }
        std::cerr << "CUDA error (" << what << "): " << cudaGetErrorString(err) << std::endl;
        return false;
    };

    // Create host vectors
    auto *h_A = new float[N];
    auto *h_B = new float[N];
    auto *h_C = new float[N];

    // Initialize host vectors
    for (uint32_t i = 0; i < N; i++) {
        h_A[i] = 1.0f;
        h_B[i] = 2.0f;
        h_C[i] = 0;
    }

    // Create device vectors. Pointers start as nullptr so the cudaFree calls
    // below are safe even when an allocation fails part-way through.
    float *d_A = nullptr;
    float *d_B = nullptr;
    float *d_C = nullptr;
    bool ok = check(cudaMalloc(&d_A, bytes), "cudaMalloc d_A")
           && check(cudaMalloc(&d_B, bytes), "cudaMalloc d_B")
           && check(cudaMalloc(&d_C, bytes), "cudaMalloc d_C")
           // Copy host vectors to device
           && check(cudaMemcpy(d_A, h_A, bytes, cudaMemcpyHostToDevice), "cudaMemcpy h_A -> d_A")
           && check(cudaMemcpy(d_B, h_B, bytes, cudaMemcpyHostToDevice), "cudaMemcpy h_B -> d_B");

    if (ok) {
        // Launch kernel
        vectorAdd<<<256, 256>>>(d_A, d_B, d_C, N);
        // A launch does not return a status; configuration errors are read
        // back with cudaGetLastError(). The copy that follows is a blocking
        // call, so its return value is checked as well.
        ok = check(cudaGetLastError(), "vectorAdd launch")
          // Copy device vector to host
          && check(cudaMemcpy(h_C, d_C, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy d_C -> h_C");
    }

    // Check result (only meaningful when every CUDA call above succeeded)
    if (ok) {
        for (uint32_t i = 0; i < N; i++) {
            if (h_C[i] != 3.0f) {
                std::cout << "Error: " << h_C[i] << " != 3.0f" << std::endl;
                ok = false;
                break;
            }
        }
    }

    // Free memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    delete[] h_A;
    delete[] h_B;
    delete[] h_C;

    // Report failure to the caller instead of always claiming success.
    return ok ? 0 : 1;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment