CUDA Work Efficient Parallel Scan.
// MP 5 Scan
// Given a list (lst) of length n
// Output its prefix sum = {lst[0], lst[0] + lst[1], lst[0] + lst[1] + ... + lst[n-1]}
// Due Tuesday, January 22, 2013 at 11:59 p.m. PST

#include <wb.h>

#define BLOCK_SIZE 512 //@@ You can change this

#define wbCheck(stmt) do {                              \
        cudaError_t err = stmt;                         \
        if (err != cudaSuccess) {                       \
            wbLog(ERROR, "Failed to run stmt ", #stmt); \
            return -1;                                  \
        }                                               \
    } while(0)

// Add the scanned sum of all preceding blocks to every element of this
// block (block 0 needs no adjustment).
__global__ void fixup(float *input, float *aux, int len) {
  unsigned int t = threadIdx.x, start = 2 * blockIdx.x * BLOCK_SIZE;
  if (blockIdx.x) {
    if (start + t < len)
      input[start + t] += aux[blockIdx.x - 1];
    if (start + BLOCK_SIZE + t < len)
      input[start + BLOCK_SIZE + t] += aux[blockIdx.x - 1];
  }
}

__global__ void scan(float *input, float *output, float *aux, int len) {
  // Load a segment of the input vector into shared memory
  __shared__ float scan_array[BLOCK_SIZE << 1];
  unsigned int t = threadIdx.x, start = 2 * blockIdx.x * BLOCK_SIZE;
  if (start + t < len)
    scan_array[t] = input[start + t];
  else
    scan_array[t] = 0;
  if (start + BLOCK_SIZE + t < len)
    scan_array[BLOCK_SIZE + t] = input[start + BLOCK_SIZE + t];
  else
    scan_array[BLOCK_SIZE + t] = 0;
  __syncthreads();

  // Reduction (up-sweep)
  int stride;
  for (stride = 1; stride <= BLOCK_SIZE; stride <<= 1) {
    int index = (t + 1) * stride * 2 - 1;
    if (index < 2 * BLOCK_SIZE)
      scan_array[index] += scan_array[index - stride];
    __syncthreads();
  }

  // Post reduction (down-sweep)
  for (stride = BLOCK_SIZE >> 1; stride; stride >>= 1) {
    int index = (t + 1) * stride * 2 - 1;
    if (index + stride < 2 * BLOCK_SIZE)
      scan_array[index + stride] += scan_array[index];
    __syncthreads();
  }

  if (start + t < len)
    output[start + t] = scan_array[t];
  if (start + BLOCK_SIZE + t < len)
    output[start + BLOCK_SIZE + t] = scan_array[BLOCK_SIZE + t];

  // Each block's total goes into the auxiliary array for the next level
  if (aux && t == 0)
    aux[blockIdx.x] = scan_array[2 * BLOCK_SIZE - 1];
}

int main(int argc, char **argv) {
  wbArg_t args;
  float *hostInput;  // The input 1D list
  float *hostOutput; // The output list
  float *deviceInput;
  float *deviceOutput;
  float *deviceAuxArray, *deviceAuxScannedArray;
  int numElements; // number of elements in the list

  args = wbArg_read(argc, argv);

  wbTime_start(Generic, "Importing data and creating memory on host");
  hostInput = (float *) wbImport(wbArg_getInputFile(args, 0), &numElements);
  cudaHostAlloc(&hostOutput, numElements * sizeof(float), cudaHostAllocDefault);
  wbTime_stop(Generic, "Importing data and creating memory on host");

  wbLog(TRACE, "The number of input elements in the input is ", numElements);

  wbTime_start(GPU, "Allocating GPU memory.");
  wbCheck(cudaMalloc((void**)&deviceInput, numElements * sizeof(float)));
  wbCheck(cudaMalloc((void**)&deviceOutput, numElements * sizeof(float)));
  // XXX the size is fixed for ease of implementation.
  cudaMalloc(&deviceAuxArray, (BLOCK_SIZE << 1) * sizeof(float));
  cudaMalloc(&deviceAuxScannedArray, (BLOCK_SIZE << 1) * sizeof(float));
  wbTime_stop(GPU, "Allocating GPU memory.");

  wbTime_start(GPU, "Clearing output memory.");
  wbCheck(cudaMemset(deviceOutput, 0, numElements * sizeof(float)));
  wbTime_stop(GPU, "Clearing output memory.");

  wbTime_start(GPU, "Copying input memory to the GPU.");
  wbCheck(cudaMemcpy(deviceInput, hostInput, numElements * sizeof(float), cudaMemcpyHostToDevice));
  wbTime_stop(GPU, "Copying input memory to the GPU.");

  //@@ Initialize the grid and block dimensions here
  int numBlocks = ceil((float)numElements / (BLOCK_SIZE << 1));
  dim3 dimGrid(numBlocks, 1, 1);
  dim3 dimBlock(BLOCK_SIZE, 1, 1);
  wbLog(TRACE, "The number of blocks is ", numBlocks);

  wbTime_start(Compute, "Performing CUDA computation");
  //@@ Modify this to complete the functionality of the scan
  //@@ on the device
  scan<<<dimGrid, dimBlock>>>(deviceInput, deviceOutput, deviceAuxArray, numElements);
  cudaDeviceSynchronize();
  scan<<<dim3(1,1,1), dimBlock>>>(deviceAuxArray, deviceAuxScannedArray, NULL, BLOCK_SIZE << 1);
  cudaDeviceSynchronize();
  fixup<<<dimGrid, dimBlock>>>(deviceOutput, deviceAuxScannedArray, numElements);
  cudaDeviceSynchronize();
  wbTime_stop(Compute, "Performing CUDA computation");

  wbTime_start(Copy, "Copying output memory to the CPU");
  wbCheck(cudaMemcpy(hostOutput, deviceOutput, numElements * sizeof(float), cudaMemcpyDeviceToHost));
  wbTime_stop(Copy, "Copying output memory to the CPU");

  wbTime_start(GPU, "Freeing GPU Memory");
  cudaFree(deviceInput);
  cudaFree(deviceOutput);
  cudaFree(deviceAuxArray);
  cudaFree(deviceAuxScannedArray);
  wbTime_stop(GPU, "Freeing GPU Memory");

  wbSolution(args, hostOutput, numElements);

  free(hostInput);
  cudaFreeHost(hostOutput);
  return 0;
}
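A quick way to sanity-check the kernels above is to compare the device output against a serial inclusive scan on the host. The helper below is my own sketch, not part of the gist; the name checkScan and the tolerance are illustrative. Call it as checkScan(hostInput, hostOutput, numElements) after the device-to-host copy.

// Hypothetical host-side reference: a serial inclusive scan used to
// verify the GPU output within a floating-point tolerance.
#include <math.h>

static int checkScan(const float *input, const float *gpuOutput, int n) {
  float running = 0.0f;
  for (int i = 0; i < n; ++i) {
    running += input[i];  // serial inclusive prefix sum
    if (fabsf(gpuOutput[i] - running) > 1e-3f * fabsf(running) + 1e-3f)
      return i;           // index of the first mismatch
  }
  return -1;              // -1 means all elements match
}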
The code above by @rchoetzlein contains an error in how it calculates the block counts. Specifically:
int naux = SCAN_BLOCKSIZE << 1;
should be:
int naux = SCAN_BLOCKSIZE;
Is this kernel only fit for ONE block? It uses shared memory, and no other operation works on the post-reduction results. Does that mean the number of elements in the vector is limited to 512 floats? Is that correct?
Yes, Tom-Demijohn is right; there are a few bugs in the above code.
The main issue is that this line in fixup() is incorrect (#22 and #24):
input[start + t] += aux[blockIdx.x - 1];
There should be no -1 here.
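To make the indexing concrete, the snippet below is my own illustration (not code from either post): which slot fixup should read depends on how the auxiliary array was scanned.

// Illustrative only -- the correct fixup index follows from the scan flavor:
//
// If aux holds an INCLUSIVE scan of the block sums (as in the gist above,
// where aux[i] = sum of blocks 0..i), block i needs the sum of blocks
// 0..i-1, so the -1 is required and block 0 is skipped:
//     input[start + t] += aux[blockIdx.x - 1];
//
// If aux holds a ZERO-OFFSET (exclusive) scan, where aux[i] already
// equals the sum of blocks 0..i-1, every block reads its own slot:
//     input[start + t] += aux[blockIdx.x];    // no -1, no block-0 skip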
More generally, a limitation of the above is that it only works when a single thread block covers the auxiliary array. Since the auxiliary scan step (the second kernel launch) throws away its own last-element sums, the auxiliary array itself cannot be fixed up. This means the maximum number of elements the above code can scan is BLOCK_SIZE^2. On CUDA hardware with a limit of 512 threads/block, that maximum is 262,144 (not even a million).
The solution is to create an auxiliary-of-the-auxiliary. This would allow BLOCK_SIZE^3 elements, or 134 million on CUDA hardware.
I've provided the answer here, but doing this is not trivial. The code above returns proper prefix sums (where the first element of the output equals the first element of the input), but any auxiliary scans must be zero-offset scans (where the first element of the output is zero, and all others are shifted over by one), or the fixup of the auxiliaries will be incorrect.
Therefore, I've created kernels which enable or disable zero-offset scans selectively at each level, and added another layer of scan & auxiliary so that the maximum limit is BLOCK_SIZE^3.
Differences from the above code are marked with "<----"
Host code (three layers):
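(A minimal sketch of the shape of the three-layer launch sequence, under my own naming: the buffer names and the extra zeroOffset argument on the scan kernel are assumptions, not the posted code. As above, lines that differ from the two-layer gist are marked with "<----".)

// Sketch only: three-layer scan, capacity BLOCK_SIZE^3.
int nBlocks1 = ceil((float)numElements / (BLOCK_SIZE << 1));
int nBlocks2 = ceil((float)nBlocks1    / (BLOCK_SIZE << 1)); // <---- second aux layer

// Layer 0: classic (inclusive) scan of the input; block sums -> aux1
scan<<<nBlocks1, BLOCK_SIZE>>>(devIn, devOut, devAux1, numElements, 0);
// Layer 1: zero-offset scan of aux1; block sums -> aux2      <----
scan<<<nBlocks2, BLOCK_SIZE>>>(devAux1, devAux1Scan, devAux2, nBlocks1, 1);
// Layer 2: zero-offset scan of aux2 (fits in one block)      <----
scan<<<1, BLOCK_SIZE>>>(devAux2, devAux2Scan, NULL, nBlocks2, 1);
// Fix up aux1 with aux2, then the output with aux1. Because the aux
// scans are zero-offset, fixup reads aux[blockIdx.x] with no -1 and
// no block-0 skip.                                           <----
fixup<<<nBlocks2, BLOCK_SIZE>>>(devAux1Scan, devAux2Scan, nBlocks1);
fixup<<<nBlocks1, BLOCK_SIZE>>>(devOut, devAux1Scan, numElements);
cudaDeviceSynchronize();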
This has been tested with up to several million elements, in both classic and zero-offset mode.
Note: Zero-offset scans are useful in applications like stream compaction and radix sort, where the input counts represent the number of elements in each bin and you want the scan to return the offsets of the bins.
bin counts = {8, 4, 5, 3, 2}
offsets scan = {0, 8, 12, 17, 20} // <-- note offset(0)=0, and scan elements are shifted right.
classic scan = {8, 12, 17, 20, 22}
If you attempted to use classic-scan directly for bin offsets, you would incorrectly reserve 4 slots for the first bin instead of 8.
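A tiny serial program of my own, reproducing the numbers above, makes the two flavors concrete:

// Standalone illustration of classic vs. zero-offset scan for bin
// offsets (serial CPU code, not from the thread's GPU kernels).
#include <stdio.h>

int main(void) {
  int counts[] = {8, 4, 5, 3, 2};
  int n = 5, running = 0;
  for (int i = 0; i < n; ++i) {
    int offset = running;   // zero-offset (exclusive): where bin i starts
    running += counts[i];   // classic (inclusive): where bin i ends
    printf("bin %d: offset=%2d inclusive=%2d\n", i, offset, running);
  }
  // Prints offsets 0, 8, 12, 17, 20 and inclusive sums 8, 12, 17, 20, 22.
  return 0;
}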