Skip to content

Instantly share code, notes, and snippets.

@killeent
Created March 16, 2017 20:10
Show Gist options
  • Save killeent/de9ee574a5e7e03c5f38f958387181a1 to your computer and use it in GitHub Desktop.
Save killeent/de9ee574a5e7e03c5f38f958387181a1 to your computer and use it in GitHub Desktop.
// Block-wide reduction for the case where `smem` may hold more values than
// the block has threads (up to N values per thread).
//
// Every thread first folds its own strided slice of `smem` -- positions
// threadIdx.x, threadIdx.x + blockDim.x, ..., threadIdx.x + (N - 1) * blockDim.x
// -- into one partial result, using `init` in place of any position that lies
// at or beyond `numVals`. The partial results are then passed to reduceBlock,
// which is told that min(blockDim.x, numVals) of them carry real data.
//
//   smem     - buffer holding the values to reduce (presumably shared memory,
//              going by the name -- confirm against the caller)
//   numVals  - number of valid entries in `smem`
//   reduceOp - binary functor combining two values of type T
//   init     - value substituted for out-of-range positions and forwarded to
//              reduceBlock
//
// Returns whatever reduceBlock returns for the per-thread partials.
//
// NOTE(review): reduceBlock is defined elsewhere; per the original author's
// note it lets a single warp finish the reduction -- not verifiable from here.
// NOTE(review): no synchronization happens here before `smem` is read; the
// caller presumably guarantees the values are already visible -- confirm.
template <typename T, typename ReduceOp, int N>
__device__ T reduceBlockN(T *smem,
                          int numVals,
                          ReduceOp reduceOp,
                          T init) {
  const unsigned int tid = threadIdx.x;
  const unsigned int stride = blockDim.x;

  // Seed the accumulator with this thread's first element, if it exists.
  T partial = tid < numVals ? smem[tid] : init;

  // Fold in the remaining N - 1 strided positions, in increasing order so the
  // sequence of reduceOp applications is fixed.
#pragma unroll
  for (int step = 1; step < N; ++step) {
    const int pos = tid + (step * stride);
    T operand = pos < numVals ? smem[pos] : init;
    partial = reduceOp(partial, operand);
  }

  // Only as many partials as there are threads (or values, if fewer) are
  // meaningful inputs to the final block reduction.
  const unsigned int liveCount = stride < numVals ? stride : numVals;
  return reduceBlock<T, ReduceOp>(smem, liveCount, partial, reduceOp, init);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment