Created
August 4, 2025 12:05
-
-
Save ehzawad/5ff87dc6a6b9047831d48f2a179adca2 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// file: vecadd_test.cu
#include <cmath>
#include <cstdio>
#include <cstdlib>
// CUDA_CHECK(call): evaluates `call` exactly once; if the resulting
// cudaError_t is not cudaSuccess, prints the source location and the
// runtime's error string to stderr, then terminates with EXIT_FAILURE.
//
// The argument is parenthesized and the temporary has a reserved-looking
// name so that an expression mentioning a caller variable called `err`
// cannot accidentally bind to the macro's own local.
// The do { ... } while (0) wrapper makes the macro behave as one statement.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        const cudaError_t cuda_check_err_ = (call);                         \
        if (cuda_check_err_ != cudaSuccess) {                               \
            fprintf(stderr, "CUDA error %s:%d: %s\n",                       \
                    __FILE__, __LINE__,                                     \
                    cudaGetErrorString(cuda_check_err_));                   \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)
/**
 * Element-wise vector addition: C[i] = A[i] + B[i] for every i in [0, N).
 *
 * Launch layout: 1D grid of 1D blocks. Each thread starts at its flat global
 * index and advances by the total number of launched threads, so the result
 * is complete for any grid size (including a grid smaller than N).
 * Indices are computed in size_t so that the block/thread product cannot
 * wrap when narrowed to a signed int.
 *
 * No shared memory is used. The pointers are dereferenced in device code and
 * must therefore be device-accessible with room for at least N floats each.
 *
 * @param A  first input vector (read only)
 * @param B  second input vector (read only)
 * @param C  output vector
 * @param N  number of elements; values <= 0 make the kernel a no-op
 */
__global__
void vecAdd(const float* A, const float* B, float* C, int N) {
    // Guard before converting to an unsigned type: a negative N would
    // otherwise become a huge element count.
    if (N <= 0) {
        return;
    }

    const size_t count  = static_cast<size_t>(N);
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;

    for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
         i < count;
         i += stride) {
        C[i] = A[i] + B[i];
    }
}
/**
 * Host driver for the vecAdd benchmark.
 *
 * Fills two host vectors with 1.0f and 2.0f, copies them to the device,
 * launches vecAdd once while timing it with CUDA events, copies the result
 * back, checks every element against 3.0f, and prints pass/fail together
 * with the elapsed time and throughput.
 *
 * @return 0 when validation passes, 1 on host allocation failure or when a
 *         mismatch is found. Any failing CUDA call terminates the process
 *         through CUDA_CHECK.
 */
int main() {
    const int N = 1 << 20;  // ~1 million elements
    const size_t bytes = static_cast<size_t>(N) * sizeof(float);

    float *h_A = (float*)malloc(bytes);
    float *h_B = (float*)malloc(bytes);
    float *h_C = (float*)malloc(bytes);
    if (!h_A || !h_B || !h_C) {
        fprintf(stderr, "Host malloc failed\n");
        // free(NULL) is a no-op, so releasing all three is safe regardless
        // of which allocation failed.
        free(h_A);
        free(h_B);
        free(h_C);
        return 1;
    }

    for (int i = 0; i < N; ++i) {
        h_A[i] = 1.0f;
        h_B[i] = 2.0f;
    }

    float *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;
    CUDA_CHECK(cudaMalloc(&d_A, bytes));
    CUDA_CHECK(cudaMalloc(&d_B, bytes));
    CUDA_CHECK(cudaMalloc(&d_C, bytes));

    CUDA_CHECK(cudaMemcpy(d_A, h_A, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B, bytes, cudaMemcpyHostToDevice));

    const int threads = 256;
    const int blocks = (N + threads - 1) / threads;  // ceil-div so the grid covers N

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    CUDA_CHECK(cudaEventRecord(start));
    vecAdd<<<blocks, threads>>>(d_A, d_B, d_C, N);
    CUDA_CHECK(cudaGetLastError());  // capture launch errors (and clear them)
    // Record `stop` directly behind the kernel in the same stream so the
    // measured interval brackets the kernel only; a host-side device
    // synchronize placed before this record would be counted in the timing.
    CUDA_CHECK(cudaEventRecord(stop));
    // Blocks until the kernel and the stop event have completed; errors
    // raised while the kernel executed are reported here.
    CUDA_CHECK(cudaEventSynchronize(stop));

    float ms = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));

    CUDA_CHECK(cudaMemcpy(h_C, d_C, bytes, cudaMemcpyDeviceToHost));

    // simple validation: stop at the first element that is not 1 + 2
    bool ok = true;
    for (int i = 0; i < N; ++i) {
        const float expected = 3.0f;
        if (fabsf(h_C[i] - expected) > 1e-5f) {
            ok = false;
            fprintf(stderr, "Mismatch at %d: %f vs %f\n", i, h_C[i], expected);
            break;
        }
    }

    // Elements processed per second, in units of 1e9. Guard against a zero
    // elapsed time so the report never divides by zero.
    const double gigaAddsPerSec =
        (ms > 0.0f) ? (double)N / (ms * 1e-3) / 1e9 : 0.0;
    printf("Vector add %s; time ms: %.3f; throughput: %.3f GElements/s\n",
           ok ? "PASSED" : "FAILED", ms, gigaAddsPerSec);

    // cleanup
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));
    free(h_A);
    free(h_B);
    free(h_C);
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    return ok ? 0 : 1;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment