Created
January 27, 2023 12:15
-
-
Save Ediolot/bee2e3941f16c7af6ef68d49732559de to your computer and use it in GitHub Desktop.
GPU and CPU performance test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <chrono>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iostream>
/**
 * Accumulates a 100-tap circular multiply-add of A with B into C.
 *
 * For every i in [0, N):
 *     C[i] += sum over j in [0, 100) of A[(i + j) % N] * B[j]
 *
 * Launch layout: 1D grid of 1D blocks. The kernel uses a grid-stride loop,
 * so any grid/block size (including <<<1, 1>>>) produces the same result.
 * No shared memory is used.
 *
 * Preconditions:
 *   - A and C hold at least N floats; B holds at least 100 floats.
 *   - C must already contain the values to accumulate onto (e.g. zeros);
 *     it is read before it is written.
 *   - C must not overlap A or B: the running sum is kept in a register and
 *     stored once per element.
 *
 * @param A  input vector of length N (device pointer)
 * @param B  tap weights, at least 100 elements (device pointer)
 * @param C  in/out accumulator vector of length N (device pointer)
 * @param N  number of elements in A and C; N == 0 is a no-op
 */
__global__ void vectorAdd(const float *A, const float *B, float *C, uint32_t N) {
    // Number of elements of B consumed per output element.
    constexpr uint32_t kTaps = 100;

    // 64-bit loop arithmetic so that `i += stride` cannot wrap around and
    // spin forever when N is close to UINT32_MAX.
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    const size_t first = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    for (size_t i = first; i < N; i += stride) {
        // Accumulate in a register: one global load and one global store of
        // C[i] instead of a read-modify-write per tap. The summation order is
        // unchanged (starts from C[i], adds the products for j = 0, 1, ...).
        float acc = C[i];

        // k walks (i + j) % N without a modulo and without 32-bit overflow:
        // it starts at i (which is < N) and wraps to 0 whenever it reaches N.
        uint32_t k = static_cast<uint32_t>(i);
        for (uint32_t j = 0; j < kTaps; ++j) {
            acc += A[k] * B[j];
            ++k;
            if (k == N) {
                k = 0;
            }
        }

        C[i] = acc;
    }
}
// Aborts the program with a diagnostic if a CUDA runtime call failed.
// `what` names the operation for the error message.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        std::exit(EXIT_FAILURE);
    }
}

/**
 * GPU vs. CPU timing test.
 *
 * Prints one CSV line to stdout:
 *     N, host->device copy ms, kernel ms, device->host copy ms, CPU ms
 *
 * The GPU result is also compared against the CPU reference; mismatches are
 * reported on stderr and make the process exit with a non-zero status.
 */
int main() {
    float ms;
    // NOTE: vectorAdd reads B[0..99] regardless of M, so M must be >= 100
    // (and equal to 100 for the CPU reference below to compute the same sums).
    const uint32_t M = 100;
    const uint32_t N = 100;
    printf("%u,", N);

    // Create host vectors. h_B holds the M tap weights, so it is sized by M
    // (not N). h_C receives the GPU result, h_ref the CPU reference.
    auto *h_A = new float[N];
    auto *h_B = new float[M];
    auto *h_C = new float[N];
    auto *h_ref = new float[N];

    // Initialize host vectors
    for (uint32_t i = 0; i < N; i++) {
        h_A[i] = 1.0f;
        h_C[i] = 0.0f;
        h_ref[i] = 0.0f;
    }
    for (uint32_t i = 0; i < M; i++) {
        h_B[i] = 2.0f;
    }

    // Create device vectors
    float *d_A, *d_B, *d_C;
    checkCuda(cudaMalloc(&d_A, N * sizeof(float)), "cudaMalloc(d_A)");
    checkCuda(cudaMalloc(&d_B, M * sizeof(float)), "cudaMalloc(d_B)");
    checkCuda(cudaMalloc(&d_C, N * sizeof(float)), "cudaMalloc(d_C)");

    // The kernel accumulates with `+=`, so the output buffer must start at
    // zero; cudaMalloc does not clear memory. All-zero bytes are 0.0f.
    checkCuda(cudaMemset(d_C, 0, N * sizeof(float)), "cudaMemset(d_C)");

    // One pair of events is created here and reused for all three timings.
    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    // Copy host vectors to device and measure time taken
    checkCuda(cudaEventRecord(start), "cudaEventRecord(start)");
    checkCuda(cudaMemcpy(d_A, h_A, N * sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy(d_A)");
    checkCuda(cudaMemcpy(d_B, h_B, M * sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy(d_B)");
    checkCuda(cudaEventRecord(stop), "cudaEventRecord(stop)");
    checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
    checkCuda(cudaEventElapsedTime(&ms, start, stop), "cudaEventElapsedTime");
    printf("%f,", ms);

    // Launch kernel and measure time taken in CUDA events
    checkCuda(cudaEventRecord(start), "cudaEventRecord(start)");
    vectorAdd<<<256, 256>>>(d_A, d_B, d_C, N);
    // Launch-configuration errors are only visible through cudaGetLastError.
    checkCuda(cudaGetLastError(), "vectorAdd launch");
    checkCuda(cudaEventRecord(stop), "cudaEventRecord(stop)");
    // Errors raised while the kernel runs surface at this synchronization.
    checkCuda(cudaEventSynchronize(stop), "vectorAdd execution");
    checkCuda(cudaEventElapsedTime(&ms, start, stop), "cudaEventElapsedTime");
    printf("%f,", ms);

    // Copy device vector to host and measure time taken
    checkCuda(cudaEventRecord(start), "cudaEventRecord(start)");
    checkCuda(cudaMemcpy(h_C, d_C, N * sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy(h_C)");
    checkCuda(cudaEventRecord(stop), "cudaEventRecord(stop)");
    checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
    checkCuda(cudaEventElapsedTime(&ms, start, stop), "cudaEventElapsedTime");
    printf("%f,", ms);

    // Run the same operation on CPU while measuring time with std::chrono as milliseconds.
    // The result goes into its own zeroed buffer so it is not added on top of
    // the GPU result that was just copied into h_C.
    auto start_cpu = std::chrono::high_resolution_clock::now();
    for (uint32_t i = 0; i < N; i++) {
        for (uint32_t j = 0; j < M; ++j) {
            h_ref[i] += h_A[(i + j) % N] * h_B[j];
        }
    }
    auto end_cpu = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_cpu - start_cpu);
    ms = float(duration.count()) * 1e-3f;
    printf("%f\n", ms);

    // Compare GPU and CPU results with a tolerance; bit-exact equality
    // between host and device floating point is not expected in general.
    uint32_t mismatches = 0;
    for (uint32_t i = 0; i < N; i++) {
        const float tolerance = 1e-5f + 1e-4f * std::fabs(h_ref[i]);
        if (std::fabs(h_C[i] - h_ref[i]) > tolerance) {
            ++mismatches;
        }
    }
    if (mismatches != 0) {
        fprintf(stderr, "GPU/CPU mismatch in %u of %u elements\n", mismatches, N);
    }

    // Free memory
    checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    checkCuda(cudaFree(d_A), "cudaFree(d_A)");
    checkCuda(cudaFree(d_B), "cudaFree(d_B)");
    checkCuda(cudaFree(d_C), "cudaFree(d_C)");
    delete[] h_A;
    delete[] h_B;
    delete[] h_C;
    delete[] h_ref;
    return mismatches == 0 ? 0 : EXIT_FAILURE;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment