Last active
September 24, 2016 12:20
-
-
Save allanmac/6160110 to your computer and use it in GitHub Desktop.
Inclusive and exclusive warp-level scan snippets. Evaluating SHFL vs. shared implementations. Also evaluating the simplest transformation of an inclusive scan into an exclusive scan. It's only two ops on sm_3x.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
// | |
// | |
// | |
#define WARP_SIZE 32 | |
#define VOLATILE volatile | |
#define KERNEL_QUALIFIERS extern "C" __global__ | |
#define DEVICE_FUNCTION_QUALIFIERS __device__ | |
#define DEVICE_INTRINSIC_QUALIFIERS __device__ __forceinline__ | |
// | |
// | |
// | |
VOLATILE __shared__ struct | |
{ | |
#if __CUDA_ARCH__ < 300 | |
unsigned int scratch[WARP_SIZE]; | |
#endif | |
} shared; | |
// | |
// | |
// | |
DEVICE_INTRINSIC_QUALIFIERS | |
unsigned int laneId() | |
{ | |
unsigned int id; | |
asm("mov.u32 %0, %%laneid;" : "=r"(id)); | |
return id; | |
} | |
DEVICE_INTRINSIC_QUALIFIERS | |
unsigned int laneMaskEQ() | |
{ | |
#if __CUDA_ARCH__ >= 200 | |
unsigned int id; | |
asm("mov.u32 %0, %%lanemask_eq;" : "=r"(id)); | |
return id; | |
#else | |
return 1u << laneId(); | |
#endif | |
} | |
// | |
// | |
// | |
/** | |
* Convert a warp-level inclusive scan to an exclusive scan by | |
* shifting the lanes to the right and assigning an 'identity' value | |
* to lane 0. | |
* | |
* @param v | |
* | |
* @return scan result for lane | |
*/ | |
#if (__CUDA_ARCH__ >= 300) | |
DEVICE_FUNCTION_QUALIFIERS | |
unsigned int | |
toExclusiveScan(unsigned int v, const unsigned int i) | |
{ | |
asm("{ \n\t" | |
" .reg .pred p; \n\t" | |
" shfl.up.b32 %0|p, %0, 0x1, 0x0; \n\t" | |
" @!p mov.u32 %0, %1; \n\t" | |
"}" | |
: "+r"(v) : "r"(i)); | |
return v; | |
} | |
#endif | |
// | |
// | |
// | |
/** | |
* Warp-level "inclusive plus scan". | |
* | |
* PTX from PTX ISA PDF | |
* | |
* @param v | |
* | |
* @return scan result for lane | |
*/ | |
DEVICE_FUNCTION_QUALIFIERS | |
unsigned int | |
plusScan(unsigned int v, const bool inclusive) | |
{ | |
#if (__CUDA_ARCH__ >= 300) | |
asm("{ \n\t" | |
" .reg .u32 t; \n\t" | |
" .reg .pred p; \n\t" | |
" shfl.up.b32 t|p, %0, 0x1, 0x0; \n\t" | |
" @p add.u32 %0, t, %0; \n\t" | |
" shfl.up.b32 t|p, %0, 0x2, 0x0; \n\t" | |
" @p add.u32 %0, t, %0; \n\t" | |
" shfl.up.b32 t|p, %0, 0x4, 0x0; \n\t" | |
" @p add.u32 %0, t, %0; \n\t" | |
" shfl.up.b32 t|p, %0, 0x8, 0x0; \n\t" | |
" @p add.u32 %0, t, %0; \n\t" | |
" shfl.up.b32 t|p, %0, 0x10, 0x0; \n\t" | |
" @p add.u32 %0, t, %0; \n\t" | |
"}" | |
: "+r"(v)); | |
if (inclusive) | |
return v; | |
else | |
return toExclusiveScan(v,0u); | |
#else | |
/* | |
// | |
// uncomment if you want to mask redundant shared stores | |
// | |
#define STORE_IF_LT_WARP_MINUS(l) \ | |
if (lid < WARP_SIZE-l) \ | |
scratch[0] = v | |
*/ | |
#define STORE_IF_LT_WARP_MINUS(l) \ | |
scratch[0] = v | |
const unsigned int lid = laneId(); | |
volatile unsigned int* scratch = shared.scratch + lid; | |
if (inclusive) | |
{ | |
scratch[0] = v; | |
} | |
else | |
{ | |
if (lid == (WARP_SIZE-1)) | |
scratch[-31] = 0u; | |
else | |
scratch[1] = v; | |
} | |
v = scratch[0]; | |
if (lid >= 1) | |
{ | |
v = v + scratch[-1]; | |
STORE_IF_LT_WARP_MINUS(2); | |
if (lid >= 2) | |
{ | |
v = v + scratch[-2]; | |
STORE_IF_LT_WARP_MINUS(4); | |
if (lid >= 4) | |
{ | |
v = v + scratch[-4]; | |
STORE_IF_LT_WARP_MINUS(8); | |
if (lid >= 8) | |
{ | |
v = v + scratch[-8]; | |
STORE_IF_LT_WARP_MINUS(16); | |
if (lid >= 16) | |
v = v + scratch[-16]; | |
} | |
} | |
} | |
} | |
return v; | |
#endif | |
} | |
// | |
// | |
// | |
KERNEL_QUALIFIERS | |
void inclusivePlusScanKernel(const unsigned int* const vin, | |
unsigned int* const vout) | |
{ | |
unsigned int v = vin[threadIdx.x]; | |
v = plusScan(v,true); | |
vout[threadIdx.x] = v; | |
} | |
// | |
// | |
// | |
KERNEL_QUALIFIERS | |
void exclusivePlusScanKernel(const unsigned int* const vin, | |
unsigned int* const vout) | |
{ | |
unsigned int v = vin[threadIdx.x]; | |
v = plusScan(v,false); | |
vout[threadIdx.x] = v; | |
} | |
// | |
// | |
// | |
void printScan(const char* const msg, | |
const unsigned int* const warp) | |
{ | |
printf("%6s:",msg); | |
for (int ii=0; ii<WARP_SIZE; ii++) | |
printf("%2d ",warp[ii]); | |
printf("\n"); | |
} | |
// | |
// | |
// | |
int main(int argc, char** argv) | |
{ | |
// scan [device] [0=exclusive] -- otherwise defaults to inclusive | |
const int device = (argc >= 2) ? atoi(argv[1]) : 0; | |
const bool inclusive = (argc == 3) ? atoi(argv[2]) != 0 : true; | |
cudaDeviceProp props; | |
cudaGetDeviceProperties(&props,device); | |
printf("%s (%2d)\n",props.name,props.multiProcessorCount); | |
printf("%s scan ...\n",inclusive ? "inclusive" : "exclusive"); | |
cudaSetDevice(device); | |
// | |
// LAUNCH KERNEL | |
// | |
unsigned int* vin; | |
unsigned int* vout; | |
cudaMalloc(&vin, sizeof(unsigned int) * WARP_SIZE); | |
cudaMalloc(&vout,sizeof(unsigned int) * WARP_SIZE); | |
// | |
// | |
// | |
unsigned int win[32] = | |
{ | |
1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1, | |
1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1 | |
}; | |
cudaMemcpy(vin,win,sizeof(unsigned int) * WARP_SIZE,cudaMemcpyHostToDevice); | |
// | |
// | |
// | |
if (inclusive) | |
inclusivePlusScanKernel<<<1,WARP_SIZE>>>(vin,vout); | |
else | |
exclusivePlusScanKernel<<<1,WARP_SIZE>>>(vin,vout); | |
cudaDeviceSynchronize(); | |
// | |
// | |
// | |
unsigned int wout[32]; | |
cudaMemcpy(wout,vout,sizeof(unsigned int) * WARP_SIZE,cudaMemcpyDeviceToHost); | |
printScan("warp",win); | |
printScan("scan",wout); | |
// | |
// | |
// | |
cudaFree(vin); | |
cudaFree(vout); | |
cudaDeviceReset(); | |
return 0; | |
} |
The SASS for a warp-level inclusive plus scan with SHFL is:
code for sm_35
Function : inclusivePlusScanKernel
.headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
/* 0x08eca0ecdc10a0a0 */
/*0008*/ S2R R0, SR_TID.X; /* 0x86400000109c0002 */
/*0010*/ ISCADD R1, R0, c[0x0][0x140], 0x2; /* 0x60c00800281c0006 */
/*0018*/ LD R1, [R1]; /* 0xc4000000001c0404 */
/*0020*/ ISCADD R0, R0, c[0x0][0x144], 0x2; /* 0x60c00800289c0002 */
/*0028*/ SHFL.UP P0, R2, R1, 0x1, 0x0; /* 0x78800003809c040a */
/*0030*/ @P0 IADD R1, R2, R1; /* 0xe080000000800806 */
/*0038*/ SHFL.UP P0, R2, R1, 0x2, 0x0; /* 0x78800003811c040a */
/* 0x08eca0a0dca0eca0 */
/*0048*/ @P0 IADD R1, R2, R1; /* 0xe080000000800806 */
/*0050*/ SHFL.UP P0, R2, R1, 0x4, 0x0; /* 0x78800003821c040a */
/*0058*/ @P0 IADD R1, R2, R1; /* 0xe080000000800806 */
/*0060*/ SHFL.UP P0, R2, R1, 0x8, 0x0; /* 0x78800003841c040a */
/*0068*/ IADD R2, R2, R1; /* 0xe0800000009c080a */
/*0070*/ SEL R1, R2, R1, P0; /* 0xe5000000009c0806 */
/*0078*/ SHFL.UP P0, R2, R1, 0x10, 0x0; /* 0x78800003881c040a */
/* 0x0800000000b810a0 */
/*0088*/ @P0 IADD R1, R2, R1; /* 0xe080000000800806 */
/*0090*/ ST [R0], R1; /* 0xe4000000001c0004 */
/*0098*/ EXIT ; /* 0x18000000001c003c */
/*00a0*/ BRA 0xa0; /* 0x12007ffffc1c003c */
The SASS for a warp-level inclusive plus scan using shared is:
code for sm_21
Function : inclusivePlusScanKernel
.headerflags @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
/*0000*/ S2R R0, SR_TID.X; /* 0x2c00000084001c04 */
/*0008*/ SHL R2, R0, 0x2; /* 0x6000c00008009c03 */
/*0010*/ S2R R3, SR_LANEID; /* 0x2c0000000000dc04 */
/*0018*/ IADD R0, R2, c[0x0][0x20]; /* 0x4800400080201c03 */
/*0020*/ SHL R4, R3, 0x2; /* 0x6000c00008311c03 */
/*0028*/ ISETP.EQ.AND P0, PT, R3, RZ, PT; /* 0x190e0000fc31dc23 */
/*0030*/ LD R1, [R0]; /* 0x8000000000005c85 */
/*0038*/ SSY 0x108; /* 0x6000000320000007 */
/*0040*/ STS [R4], R1; /* 0xc900000000405c85 */
/*0048*/ LDS R0, [R4]; /* 0xc100000000401c85 */
/*0050*/ @P0 NOP.S; /* 0x40000000000001f4 */
/*0058*/ ISETP.LT.U32.AND P0, PT, R3, 0x2, PT; /* 0x188ec0000831dc03 */
/*0060*/ LDS R1, [R4+-0x4]; /* 0xc103fffff0405c85 */
/*0068*/ IADD R0, R1, R0; /* 0x4800000000101c03 */
/*0070*/ STS [R4], R0; /* 0xc900000000401c85 */
/*0078*/ @P0 NOP.S; /* 0x40000000000001f4 */
/*0080*/ ISETP.LT.U32.AND P0, PT, R3, 0x4, PT; /* 0x188ec0001031dc03 */
/*0088*/ LDS R1, [R4+-0x8]; /* 0xc103ffffe0405c85 */
/*0090*/ IADD R0, R1, R0; /* 0x4800000000101c03 */
/*0098*/ STS [R4], R0; /* 0xc900000000401c85 */
/*00a0*/ @P0 NOP.S; /* 0x40000000000001f4 */
/*00a8*/ ISETP.LT.U32.AND P0, PT, R3, 0x8, PT; /* 0x188ec0002031dc03 */
/*00b0*/ LDS R1, [R4+-0x10]; /* 0xc103ffffc0405c85 */
/*00b8*/ IADD R0, R1, R0; /* 0x4800000000101c03 */
/*00c0*/ STS [R4], R0; /* 0xc900000000401c85 */
/*00c8*/ @P0 NOP.S; /* 0x40000000000001f4 */
/*00d0*/ ISETP.LT.U32.AND P0, PT, R3, 0x10, PT; /* 0x188ec0004031dc03 */
/*00d8*/ LDS R1, [R4+-0x20]; /* 0xc103ffff80405c85 */
/*00e0*/ IADD R0, R1, R0; /* 0x4800000000101c03 */
/*00e8*/ STS [R4], R0; /* 0xc900000000401c85 */
/*00f0*/ @P0 NOP.S; /* 0x40000000000001f4 */
/*00f8*/ LDS R1, [R4+-0x40]; /* 0xc103ffff00405c85 */
/*0100*/ IADD.S R0, R1, R0; /* 0x4800000000101c13 */
/*0108*/ IADD R1, R2, c[0x0][0x24]; /* 0x4800400090205c03 */
/*0110*/ ST [R1], R0; /* 0x9000000000101c85 */
/*0118*/ EXIT ; /* 0x8000000000001de7 */
inclusive:
> scan 0 1
Tesla K20c (13)
inclusive scan ...
warp: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
scan: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
exclusive:
> scan 0 0
Tesla K20c (13)
exclusive scan ...
warp: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
scan: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Compiled with: