Skip to content

Instantly share code, notes, and snippets.

@KellenSunderland
Created February 14, 2018 08:47
Show Gist options
  • Save KellenSunderland/c5ea163dbde5b47bf5ff0ab848687138 to your computer and use it in GitHub Desktop.
Save KellenSunderland/c5ea163dbde5b47bf5ff0ab848687138 to your computer and use it in GitHub Desktop.
Reduce Test — CUDA interleaved-pair parallel sum reduction, benchmarked against a sequential CPU reference.
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cuda_runtime.h>
#include <iostream>
// Forward declaration. Copies h_idata to the device, runs the interleaved-pair
// reduction kernel, copies the per-block partial sums back into h_odata, and
// returns their total (defined after main, below the kernel).
int gpu_reduce(int size, const dim3 &block, const dim3 &grid, size_t bytes, int *h_idata, int *h_odata,
int *d_idata, int *d_odata);
// Sequential reference reduction: sums the first `size` ints of h_idata into
// cpu_sum. Any previous value of cpu_sum is discarded (a size of zero or less
// yields 0).
void cpu_reduce(int size, int *h_idata, int &cpu_sum) {
    int total = 0;
    for (int idx = 0; idx < size; ++idx) {
        total += h_idata[idx];
    }
    cpu_sum = total;
}
// Check a CUDA runtime call and abort with a diagnostic on failure.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            abort();                                                          \
        }                                                                     \
    } while (0)

// Benchmark a parallel sum reduction: sequential CPU reference vs. the GPU
// interleaved-pair kernel. Optional argv[1] overrides the block size
// (default 512). Returns EXIT_SUCCESS when the GPU and CPU sums agree,
// EXIT_FAILURE on mismatch or host-allocation failure.
int main(int argc, char **argv) {
    int dev = 0;
    cudaDeviceProp deviceProp;
    CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("%s starting reduction at ", argv[0]);
    printf("device %d: %s ", dev, deviceProp.name);
    CUDA_CHECK(cudaSetDevice(dev));

    // Total number of elements to reduce.
    int size = 1 << 24;
    printf("with array size %d ", size);

    // Execution configuration: block size is overridable from the command line.
    int blocksize = 512;
    if (argc > 1) {
        blocksize = atoi(argv[1]);
    }
    dim3 block(blocksize, 1);
    // Ceil-divide so the grid covers every element.
    dim3 grid((size + block.x - 1) / block.x, 1);
    printf("grid %d block %d\n", grid.x, block.x);

    // Allocate host memory: the input array plus one partial sum per block.
    size_t bytes = size * sizeof(int);
    auto h_idata = (int *) malloc(bytes);
    auto h_odata = (int *) malloc(grid.x * sizeof(int));
    if (h_idata == nullptr || h_odata == nullptr) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }

    // Initialize the input with small values (0..255) so an int accumulator
    // cannot overflow across 2^24 elements.
    for (int i = 0; i < size; i++) {
        h_idata[i] = (int) (rand() & 0xFF);
    }

    // Allocate device memory.
    int *d_idata = nullptr;
    int *d_odata = nullptr;
    CUDA_CHECK(cudaMalloc((void **) &d_idata, bytes));
    CUDA_CHECK(cudaMalloc((void **) &d_odata, grid.x * sizeof(int)));

    // CPU reference reduction, timed with the portable monotonic clock
    // (std::chrono::steady_clock; _V2 is a libstdc++-internal namespace).
    int cpu_sum = 0;
    auto start = std::chrono::steady_clock::now();
    cpu_reduce(size, h_idata, cpu_sum);
    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(
        std::chrono::steady_clock::now() - start);
    printf("cpu reduce elapsed %li ms cpu_sum: %d\n", (long) duration.count(), cpu_sum);

    // Warm up so the timed run below excludes one-time startup cost.
    for (int i = 0; i < 5; i++) {
        gpu_reduce(size, block, grid, bytes, h_idata, h_odata, d_idata, d_odata);
    }

    // Timed GPU reduction.
    start = std::chrono::steady_clock::now();
    int gpu_sum = gpu_reduce(size, block, grid, bytes, h_idata, h_odata, d_idata, d_odata);
    duration = std::chrono::duration_cast<std::chrono::milliseconds>(
        std::chrono::steady_clock::now() - start);
    printf("gpu reduce elapsed %li ms gpu_sum: %d\n", (long) duration.count(), gpu_sum);

    // Free host memory.
    free(h_idata);
    free(h_odata);
    // Free device memory.
    CUDA_CHECK(cudaFree(d_idata));
    CUDA_CHECK(cudaFree(d_odata));
    // Reset device.
    CUDA_CHECK(cudaDeviceReset());

    // Check the results and report failure through the exit status.
    bool bResult = (gpu_sum == cpu_sum);
    if (!bResult) {
        printf("Test failed\n");
        return EXIT_FAILURE;
    }
    return EXIT_SUCCESS;
}
//
// Created by kellen on 11.02.18.
//
// Interleaved Pair Implementation
#include <stdio.h>
// Interleaved-pair in-place sum reduction. Each block reduces its own
// blockDim.x-wide segment of g_idata and writes the partial sum to
// g_odata[blockIdx.x]. g_idata is clobbered.
//
// Launch layout: 1-D grid of 1-D blocks with gridDim.x * blockDim.x >= n.
// Assumes blockDim.x is a power of two (the stride halving would skip
// elements otherwise).
__global__ void interleaved(int *g_idata, int *g_odata, unsigned int n) {
    // Set thread ids.
    unsigned int tid = threadIdx.x;
    unsigned int idx = blockDim.x * blockIdx.x + tid;
    // Convert global data pointer to local block data pointer.
    int *idata = g_idata + blockIdx.x * blockDim.x;
    // In-place reduction in global memory.
    //
    // The bounds check is folded into the pairing condition instead of the
    // original early return: `if (idx >= n) return;` made out-of-range
    // threads skip the __syncthreads() below, and a barrier not reached by
    // every thread of the block is undefined behavior. It also let in-range
    // threads of a partial tail block read idata[tid + stride] past n.
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        // idx + stride < n implies both operands are in range, since
        // idx < idx + stride.
        if (tid < stride && idx + stride < n) {
            idata[tid] += idata[tid + stride];
        }
        __syncthreads();
    }
    // Thread 0 publishes this block's partial sum (skipped if the whole
    // block lies past n).
    if (tid == 0 && idx < n) g_odata[blockIdx.x] = idata[0];
}
// Runs one full GPU reduction pass: uploads h_idata, launches the
// interleaved kernel (which leaves one partial sum per block in d_odata),
// downloads the partials into h_odata, and finishes the sum on the host.
// Returns the total; prints a diagnostic and aborts on any CUDA error.
//
// d_idata is clobbered by the in-place kernel, which is why the input is
// re-uploaded from h_idata on every call.
int gpu_reduce(int size, const dim3 &block, const dim3 &grid, size_t bytes, int *h_idata, int *h_odata, int *d_idata,
               int *d_odata) {
    // Stage the input on the device. cudaMemcpy is blocking, so no explicit
    // synchronization is needed before the launch.
    cudaError_t err = cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        fprintf(stderr, "gpu_reduce H2D copy failed: %s\n", cudaGetErrorString(err));
        abort();
    }
    interleaved<<<grid, block>>>(d_idata, d_odata, (unsigned int) size);
    // Kernel launches do not return errors directly: catch launch-config
    // errors here; in-kernel execution errors surface at the blocking
    // device-to-host copy below.
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "gpu_reduce kernel launch failed: %s\n", cudaGetErrorString(err));
        abort();
    }
    // Blocking D2H copy of the per-block partial sums; this also
    // synchronizes with the kernel.
    err = cudaMemcpy(h_odata, d_odata, grid.x * sizeof(int), cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        fprintf(stderr, "gpu_reduce D2H copy failed: %s\n", cudaGetErrorString(err));
        abort();
    }
    // Finish the reduction on the host (unsigned index to match grid.x).
    int gpu_sum = 0;
    for (unsigned int i = 0; i < grid.x; i++) {
        gpu_sum += h_odata[i];
    }
    return gpu_sum;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment