Created
February 17, 2021 05:22
-
-
Save sandeepkumar-skb/2fff5f85c2772c0955ad9ae6149c998f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#define BLOCK_SIZE 256 | |
// Terminates the program if a CUDA runtime call reported a failure.
//   err  - status code returned by the CUDA call being checked
//   file - source file of the call site, echoed in the diagnostic
//   line - source line of the call site, echoed in the diagnostic
// When err is cudaSuccess nothing happens; otherwise the CUDA error string
// and the call-site location are printed and the process exits with
// EXIT_FAILURE.
inline void gpuAssert(cudaError_t err, const char *file, int line)
{
    if (err == cudaSuccess) {
        return;
    }
    const char *reason = cudaGetErrorString(err);
    printf("%s in %s at line %d\n", reason, file, line);
    exit(EXIT_FAILURE);
}
// Checks the cudaError_t produced by `ans` via gpuAssert, tagging the report
// with the file and line of the call site.
// The body is wrapped in do { ... } while (0) so that `gpuErrchk(x);` expands
// to exactly one statement; a bare `{ ... }` followed by the caller's `;`
// would break `if (c) gpuErrchk(x); else ...`.
#define gpuErrchk(ans)                        \
    do {                                      \
        gpuAssert((ans), __FILE__, __LINE__); \
    } while (0)
// Host reference reduction: adds the first n ints of `data`, front to back,
// and stores the total in *sum (any previous value of *sum is overwritten).
// A non-positive n yields a total of 0.
void cpu_all_reduce(int* sum, int* data, int n){
    int total = 0;
    const int* cursor = data;
    // Count down the remaining elements while walking the array forward.
    for (int remaining = n; remaining > 0; --remaining) {
        total += *cursor++;
    }
    *sum = total;
}
// Sums the first n ints of `data` into *sum using an in-place tree reduction
// carried out directly on `data`.
//
// Launch layout: 1D grid of 1D blocks, one thread per element; the grid is
// expected to cover at least n threads (e.g. ceil(n / blockDim.x) blocks).
// No shared memory is used.
//
//   sum  - accumulator; each block atomically adds its partial sum to it, so
//          the caller must zero it before the launch
//   data - input values; OVERWRITTEN with intermediate partial sums
//   n    - number of valid elements in `data`
//
// Every access is guarded against n, so a partially filled last block is
// handled, and the halving step rounds up so block sizes that are not a
// power of two do not drop elements.
__global__
void gpu_all_reduce_global(int* sum, int* data, int n){
    const int tid = threadIdx.x;
    const int idx = blockDim.x * blockIdx.x + tid;

    // `active` is the number of leading slots of this block's slice that
    // still hold partial sums. Each round folds the upper part
    // [half, active) onto [0, active - half); the two ranges never overlap
    // because active - half <= half.
    for (int active = blockDim.x; active > 1; ) {
        const int half = (active + 1) / 2;
        if (tid < active - half && idx + half < n) {
            data[idx] += data[idx + half];
        }
        // Barrier is outside the branch so every thread of the block reaches it.
        __syncthreads();
        active = half;
    }

    // Slot 0 of the slice now holds the block total; skip blocks that start
    // past the end of the data.
    if (tid == 0 && idx < n) {
        atomicAdd(sum, data[idx]);
    }
}
// Fills the first `size` ints of `data` with the value 2.
// A non-positive size leaves the buffer untouched.
void init(int *data, int size){
    int *slot = data;
    int remaining = size;
    while (remaining > 0) {
        *slot = 2;
        ++slot;
        --remaining;
    }
}
// Benchmarks the CPU reference reduction against gpu_all_reduce_global on
// 2^24 ints held in managed memory, prints both sums and timings when they
// agree, and reports a mismatch otherwise.
// Returns 0 when the GPU and CPU sums match, EXIT_FAILURE when they differ.
// Any failing CUDA call terminates the program through gpuErrchk.
int main(){
    const int n = 1 << 24;

    // execution configuration: one thread per element, ceil-div grid
    dim3 blockSize(BLOCK_SIZE, 1, 1);
    dim3 nBlocks((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 1, 1);

    // cpu variable for golden model
    int cpu_sum = 0;

    // managed buffers shared by the CPU and GPU paths
    int *input_data = nullptr;
    int *gpu_sum = nullptr;
    gpuErrchk(cudaMallocManaged(&gpu_sum, sizeof(int)));
    gpuErrchk(cudaMallocManaged(&input_data, n * sizeof(int)));
    init(input_data, n);
    // No GPU work is in flight here, so the managed accumulator can be
    // zeroed directly from the host.
    *gpu_sum = 0;

    // CPU
    // Warmup run; cpu_all_reduce does not modify its input, so no
    // re-initialisation is needed before the timed run.
    cpu_all_reduce(&cpu_sum, input_data, n);

    // Time only the reduction itself.
    std::chrono::high_resolution_clock::time_point cpu_start = std::chrono::high_resolution_clock::now();
    cpu_all_reduce(&cpu_sum, input_data, n);
    std::chrono::high_resolution_clock::time_point cpu_end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> cpu_span = cpu_end - cpu_start;

    // GPU
    // Warmup
    gpu_all_reduce_global<<<nBlocks, blockSize>>>(gpu_sum, input_data, n);
    gpuErrchk(cudaGetLastError());       // launch-configuration errors
    gpuErrchk(cudaDeviceSynchronize());  // errors raised while the kernel ran

    cudaEvent_t gpu_start, gpu_stop;
    gpuErrchk(cudaEventCreate(&gpu_start));
    gpuErrchk(cudaEventCreate(&gpu_stop));

    // The kernel reduces in place, so the warmup overwrote the input and
    // accumulated into gpu_sum; restore both before the timed launch.
    // NOTE(review): the host touches the managed buffer right before the
    // timed launch, so the measured time may include managed-memory
    // migration to the device.
    init(input_data, n);
    *gpu_sum = 0;

    gpuErrchk(cudaEventRecord(gpu_start));
    gpu_all_reduce_global<<<nBlocks, blockSize>>>(gpu_sum, input_data, n);
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaEventRecord(gpu_stop));
    // Waiting on gpu_stop also guarantees the kernel finished before the
    // host reads *gpu_sum below.
    gpuErrchk(cudaEventSynchronize(gpu_stop));

    float milliseconds = 0;
    gpuErrchk(cudaEventElapsedTime(&milliseconds, gpu_start, gpu_stop));

    const bool results_match = (*gpu_sum == cpu_sum);
    if (results_match){
        std::cout << "cpu sum: " << cpu_sum << std::endl;
        std::cout << "gpu sum: " << *gpu_sum << std::endl;
        std::cout << "cpu time: " << cpu_span.count()*1000 << "ms" << std::endl;
        std::cout << "gpu time: " << milliseconds << "ms" << std::endl;
    }
    else{
        std::cout << "GPU and CPU results don't Match!!" << std::endl;
    }

    gpuErrchk(cudaEventDestroy(gpu_start));
    gpuErrchk(cudaEventDestroy(gpu_stop));
    gpuErrchk(cudaFree(gpu_sum));
    gpuErrchk(cudaFree(input_data));

    return results_match ? 0 : EXIT_FAILURE;
}
Author
sandeepkumar-skb
commented
Feb 17, 2021
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment