// Skip to content
//
// Instantly share code, notes, and snippets.
//
// @ehzawad
// Created August 4, 2025 12:05
// Show Gist options
//   - Save ehzawad/5ff87dc6a6b9047831d48f2a179adca2 to your computer and use it in GitHub Desktop.
// file: vecadd_test.cu
#include <cmath>
#include <cstdio>
#include <cstdlib>
// Evaluates a CUDA runtime call exactly once and terminates the program if it
// did not return cudaSuccess. On failure it prints the source file, line and
// the cudaGetErrorString() text to stderr, then calls exit(EXIT_FAILURE).
//
// The argument is parenthesized, and the status is held in `err_` rather than
// `err`, so that an argument expression mentioning a caller variable named
// `err` is not captured by the macro's own local.
// The do { ... } while (0) wrapper makes the macro behave as one statement.
#define CUDA_CHECK(call) do { \
    cudaError_t err_ = (call); \
    if (err_ != cudaSuccess) { \
        fprintf(stderr, "CUDA error %s:%d: %s\n", \
                __FILE__, __LINE__, cudaGetErrorString(err_)); \
        exit(EXIT_FAILURE); \
    } \
} while (0)
// Element-wise vector addition: C[i] = A[i] + B[i] for every 0 <= i < N.
//
// Launch layout: 1D grid of 1D blocks (only the .x components are used).
// Each thread starts at its flat global index and then advances by the total
// number of launched threads, so all N elements are processed for any
// grid/block size, including a grid that is smaller than N.
// No shared memory is used.
//
// The index is 64-bit on purpose: blockIdx.x * blockDim.x is an unsigned
// product, and storing it in an int could yield a negative value for very
// large grids, which would pass an `i < N` check and index out of bounds.
//
// A, B, C must each reference at least N floats; N <= 0 performs no work.
__global__
void vecAdd(const float* A, const float* B, float* C, int N) {
    const long long stride = (long long)gridDim.x * blockDim.x;
    for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         i < N; i += stride) {
        C[i] = A[i] + B[i];
    }
}
// Host driver for the vecAdd kernel.
//
// Fills two host arrays of N floats with 1.0f and 2.0f, copies them to the
// device, launches vecAdd once while timing it with CUDA events, copies the
// result back and checks that every element equals 3.0f.
// Prints PASSED/FAILED with the elapsed time and throughput.
// Returns 0 when validation passes, 1 otherwise (or on host malloc failure).
// Any failing CUDA call terminates the process through CUDA_CHECK.
int main() {
    const int N = 1 << 20; // ~1 million elements
    const size_t bytes = (size_t)N * sizeof(float);

    float *h_A = (float*)malloc(bytes);
    float *h_B = (float*)malloc(bytes);
    float *h_C = (float*)malloc(bytes);
    if (!h_A || !h_B || !h_C) {
        fprintf(stderr, "Host malloc failed\n");
        // free(NULL) is a no-op, so release whichever allocations succeeded.
        free(h_A);
        free(h_B);
        free(h_C);
        return 1;
    }
    for (int i = 0; i < N; ++i) {
        h_A[i] = 1.0f;
        h_B[i] = 2.0f;
    }

    float *d_A, *d_B, *d_C;
    CUDA_CHECK(cudaMalloc(&d_A, bytes));
    CUDA_CHECK(cudaMalloc(&d_B, bytes));
    CUDA_CHECK(cudaMalloc(&d_C, bytes));
    CUDA_CHECK(cudaMemcpy(d_A, h_A, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B, bytes, cudaMemcpyHostToDevice));

    const int threads = 256;
    const int blocks = (N + threads - 1) / threads; // ceil(N / threads)

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // Timed region: only the kernel sits between the two events. The stop
    // event is recorded before any host-side wait so the measured interval
    // does not include the latency of a device-wide synchronize.
    CUDA_CHECK(cudaEventRecord(start));
    vecAdd<<<blocks, threads>>>(d_A, d_B, d_C, N);
    CUDA_CHECK(cudaGetLastError());         // capture launch errors
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop)); // wait for the kernel; surfaces runtime errors

    float ms = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));

    CUDA_CHECK(cudaMemcpy(h_C, d_C, bytes, cudaMemcpyDeviceToHost));

    // simple validation: stop at the first element that is not 1.0f + 2.0f
    bool ok = true;
    const float expected = 3.0f;
    for (int i = 0; i < N; ++i) {
        if (fabsf(h_C[i] - expected) > 1e-5f) {
            ok = false;
            fprintf(stderr, "Mismatch at %d: %f vs %f\n", i, h_C[i], expected);
            break;
        }
    }

    // Elements per second in units of 1e9; report 0 if the timer returned a
    // non-positive interval instead of dividing by zero.
    double gigaAddsPerSec = (ms > 0.0f) ? (double)N / (ms * 1e-3) / 1e9 : 0.0;
    printf("Vector add %s; time ms: %.3f; throughput: %.3f GElements/s\n",
           ok ? "PASSED" : "FAILED", ms, gigaAddsPerSec);

    // cleanup
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));
    free(h_A);
    free(h_B);
    free(h_C);
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    return ok ? 0 : 1;
}
// Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment