#ifndef THC_REDUCE_APPLY_UTILS_INC
#define THC_REDUCE_APPLY_UTILS_INC

#include <algorithm>
#include <cuda.h>
#include <assert.h>

#include "THCGeneral.h"
#include "THCTensor.h"
#include "THCDeviceUtils.cuh"
#include "THCTensorInfo.cuh"
#include "THCAsmUtils.cuh"

// Enum that indicates whether tensor arguments are read/write or
// read-only
enum TensorArgType { ReadWrite, ReadOnly };
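
// Computes a linear index for the current block from its (x, y, z) position
// in the launch grid.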
template <typename IndexType>
__device__ __forceinline__ IndexType getLinearBlockId() {
  return blockIdx.z * gridDim.y * gridDim.x +
         blockIdx.y * gridDim.x +
         blockIdx.x;
}

// Returns the minimum size of shared memory required for performing N
// reductions across a block, with each reduction having at most numVals
// individual elements to reduce.
template <typename T, int N>
int reduceSmemSize(THCState *state, long numVals) {
  // check if we can use a warp shuffle
  if (state->deviceProperties[0].major >= 3) {
    printf("new\n");
    return state->deviceProperties[0].warpSize * N * sizeof(T);
  } else {
    printf("old\n");
    return std::min(numVals, 1024L) * N * sizeof(T);
  }
}
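
// Example (sketch, not part of the original header): choosing the dynamic
// shared-memory size when launching a kernel that performs two concurrent
// float reductions over up to `numVals` elements. `myReduceKernel` is a
// hypothetical kernel name used only for illustration.
//
//   int smemBytes = reduceSmemSize<float, 2>(state, numVals);
//   myReduceKernel<<<grid, block, smemBytes>>>(...);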

// Warp shuffle-down wrappers for the element types used by the reductions.
template <typename T>
struct THCWarpUtils {
  static __device__ __forceinline__ T shfldown(T val, unsigned int offset) {
    return __shfl_down(val, offset);
  }
};

template <>
struct THCWarpUtils<unsigned char> {
  static __device__ __forceinline__ unsigned char shfldown(unsigned char val, unsigned int offset) {
    return (unsigned char) __shfl_down((int) val, offset);
  }
};

template <>
struct THCWarpUtils<char> {
  static __device__ __forceinline__ char shfldown(char val, unsigned int offset) {
    return (char) __shfl_down((int) val, offset);
  }
};

template <>
struct THCWarpUtils<short> {
  static __device__ __forceinline__ short shfldown(short val, unsigned int offset) {
    return (short) __shfl_down((int) val, offset);
  }
};
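
// __shfl_down operates on 32-bit values, so 64-bit types (double, long) are
// reinterpreted as an int2 and shuffled one 32-bit half at a time.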
template <>
struct THCWarpUtils<double> {
  static __device__ __forceinline__ double shfldown(double val, unsigned int offset) {
    int2 a = *reinterpret_cast<int2*>(&val);
    a.x = __shfl_down(a.x, offset);
    a.y = __shfl_down(a.y, offset);
    return *reinterpret_cast<double*>(&a);
  }
};

template <>
struct THCWarpUtils<long> {
  static __device__ __forceinline__ long shfldown(long val, unsigned int offset) {
    int2 a = *reinterpret_cast<int2*>(&val);
    a.x = __shfl_down(a.x, offset);
    a.y = __shfl_down(a.y, offset);
    return *reinterpret_cast<long*>(&a);
  }
};
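
// Shuffle-based reduction of N values per thread within a warp. After the
// loop, lane 0 of each warp holds the warp-wide reduction for each of the
// N values.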
template <typename T, typename ReduceOp, int N>
__device__ void warpReduce(T threadVals[N], ReduceOp reduceOp) {
  #pragma unroll
  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
    #pragma unroll
    for (int i = 0; i < N; ++i) {
      T neighbor = THCWarpUtils<T>::shfldown(threadVals[i], offset);
      threadVals[i] = reduceOp(threadVals[i], neighbor);
    }
  }
}
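
// Block-wide reduction built on warp shuffles: each warp reduces its own
// lanes' values, lane 0 of every warp writes its N partial results into smem
// (which must hold at least warpSize * N elements), and the first warp then
// reduces the per-warp partials, leaving the block-wide result for each of
// the N values in thread 0's threadVals.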
template <typename T, typename ReduceOp, int N>
__device__ void warpReduceBlock(T *smem, T threadVals[N], int numVals, ReduceOp reduceOp, T init) {
  // First, warps cooperate to reduce values within the warp
  warpReduce<T, ReduceOp, N>(threadVals, reduceOp);

  int lane = getLaneId();
  int warp = threadIdx.x / warpSize;

  if (lane == 0) {
    #pragma unroll
    for (int i = 0; i < N; ++i) {
      smem[warp + (i * warpSize)] = threadVals[i];
    }
  }
  __syncthreads();

  #pragma unroll
  for (int i = 0; i < N; ++i) {
    threadVals[i] = (threadIdx.x < (blockDim.x / warpSize)) ? smem[lane + (i * warpSize)] : init;
  }

  if (warp == 0) {
    warpReduce<T, ReduceOp, N>(threadVals, reduceOp);
  }
}

// Reduce N values concurrently, i.e. suppose N = 2, and there are 4 threads:
// (1, 2), (3, 4), (5, 6), (7, 8); then the result in threadVals for thread 0
// is (1 + 3 + 5 + 7, 2 + 4 + 6 + 8) = (16, 20).
template <typename T, typename ReduceOp, int N>
__device__ void reduceNValuesInBlock(T *smem,
                                     T threadVals[N],
                                     int numVals,
                                     ReduceOp reduceOp,
                                     T init) {
  if (numVals == 0) {
    #pragma unroll
    for (int i = 0; i < N; ++i) {
      threadVals[i] = init;
    }
    return;
  }

#if __CUDA_ARCH__ >= 300
  warpReduceBlock<T, ReduceOp, N>(smem, threadVals, numVals, reduceOp, init);
#else
  // We store each of the N values contiguously, so if N = 2, all values for
  // the first threadVal for each thread in the block are stored followed by
  // all of the values for the second threadVal for each thread in the block
  if (threadIdx.x < numVals) {
    #pragma unroll
    for (int i = 0; i < N; ++i) {
      smem[i * numVals + threadIdx.x] = threadVals[i];
    }
  }
  __syncthreads();

  // Number of lanes in the final reduction --> this is used to determine
  // where to put the outputs of each of the N values we are reducing. If
  // numLanesParticipating = 32, then we have the 32 outputs for the first
  // threadVal, followed by the 32 outputs for the second threadVal, etc.
  int numLanesParticipating = min(numVals, warpSize);

  if (numVals > warpSize && ((threadIdx.x / warpSize) == 0)) {
    #pragma unroll
    for (int i = 0; i < N; ++i) {
      threadVals[i] = threadIdx.x < numVals ? threadVals[i] : init;
    }

    for (int i = warpSize + threadIdx.x; i < numVals; i += warpSize) {
      #pragma unroll
      for (int j = 0; j < N; ++j) {
        threadVals[j] = reduceOp(threadVals[j], smem[j * numVals + i]);
      }
    }

    #pragma unroll
    for (int i = 0; i < N; ++i) {
      smem[i * numLanesParticipating + threadIdx.x] = threadVals[i];
    }
  }
  __syncthreads();

  if (threadIdx.x == 0) {
    if (numLanesParticipating == 32) {
      #pragma unroll
      for (int i = 0; i < N; ++i) {
        #pragma unroll
        for (int j = 1; j < 32; ++j) {
          threadVals[i] = reduceOp(threadVals[i], smem[i * 32 + j]);
        }
      }
    } else {
      #pragma unroll
      for (int i = 0; i < N; ++i) {
        for (int j = 1; j < numLanesParticipating; ++j) {
          threadVals[i] = reduceOp(threadVals[i], smem[i * numVals + j]);
        }
      }
    }
  }
#endif
}

// Block-wide reduction in shared memory helper; only threadIdx.x == 0 will
// return the reduced value
template <typename T, typename ReduceOp>
__device__ T reduceBlock(T* smem,
                         int numVals,
                         T threadVal,
                         ReduceOp reduceOp,
                         T init) {
  reduceNValuesInBlock<T, ReduceOp, 1>(smem, &threadVal, numVals, reduceOp, init);
  return threadVal;
}

// Block-wide reduction where each thread locally reduces N
// values before letting a single warp take over - assumes
// threadVals is in registers, not shared memory
template <typename T, typename ReduceOp, int N>
__device__ T reduceBlockWithNThreadLocalReductions(T *smem,
                                                   T threadVals[N],
                                                   int numVals,
                                                   ReduceOp reduceOp,
                                                   T init) {
  int offset = threadIdx.x * N;
  T local = offset < numVals ? threadVals[0] : init;

  #pragma unroll
  for (int i = 1; i < N; ++i) {
    ++offset;
    T next = offset < numVals ? threadVals[i] : init;
    local = reduceOp(local, next);
  }

  return reduceBlock<T, ReduceOp>(smem, blockDim.x < numVals ? blockDim.x : numVals, local, reduceOp, init);
}
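
// Example (sketch, not part of the original header): a kernel that sums
// `numVals` floats, one per thread, with a block-wide reduction. `SumOp`
// and `sumKernel` are hypothetical names; the dynamic shared-memory size at
// launch would come from reduceSmemSize<float, 1>. Assumes
// numVals <= blockDim.x.
//
//   struct SumOp {
//     __device__ float operator()(float a, float b) const { return a + b; }
//   };
//
//   __global__ void sumKernel(const float* in, float* out, int numVals) {
//     extern __shared__ float smem[];
//     float v = threadIdx.x < numVals ? in[threadIdx.x] : 0.0f;
//     float total = reduceBlock<float, SumOp>(smem, numVals, v, SumOp(), 0.0f);
//     if (threadIdx.x == 0) {
//       *out = total;
//     }
//   }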

// Make sure the given tensor doesn't have too many dimensions
void THCCheckTensorDims(THCState* state, THCudaTensor* tensor, int arg);

// Produces a grid with at least one point per tile
THC_API bool THC_getGridFromTiles(ptrdiff_t gridTiles, dim3& grid);

#endif // THC_REDUCE_APPLY_UTILS_INC