// A case for making the compiler more threadIdx aware in conditional code.
// Proposed solution:
// Walk the dependencies of any predicate gating a shfl.sync to look for threadIdx.
// Simulate all 1024 values of threadIdx with the full predicate expression to see if it's warp uniform.
// Or check whether only a single thread is active, for other optimizations (like in the atomic add below).
// This can't be that complicated to do.
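// A minimal host-side sketch of that analysis (hypothetical, not part of the
// original gist): evaluate the extracted predicate for every threadIdx value
// and check that all 32 lanes of each warp agree. `pred` stands in for
// whatever predicate expression the compiler pulled out of the branch.
static bool predicate_is_warp_uniform(bool (*pred)(uint tid), uint block_size)
{
    for (uint warp = 0; warp < block_size / 32; warp++)
    {
        bool first = pred(warp * 32);
        for (uint lane = 1; lane < 32; lane++)
            if (pred(warp * 32 + lane) != first)
                return false; // lanes disagree => branch may diverge within a warp
    }
    return true; // every warp agrees => shfl.sync can be emitted branch-free
}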
__device__ __forceinline__ float shfl_xor(float var, int laneMask)
{
    float ret;
    asm ("shfl.sync.bfly.b32 %0, %1, %2, 0x1f, 0xffffffff;" : "=f"(ret) : "f"(var), "r"(laneMask));
    return ret;
}
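// For reference, this wrapper should be equivalent to the plain CUDA intrinsic
// (full-warp mask, default width) shown below; an assumed alternative, the
// inline asm above is what the gist actually exercises.
__device__ __forceinline__ float shfl_xor_intrinsic(float var, int laneMask)
{
    return __shfl_xor_sync(0xffffffff, var, laneMask);
}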
__device__ __forceinline__ float sum4(float4 a) { return (a.x + a.y) + (a.z + a.w); }

__device__ __forceinline__ float cta_reduce_sum(float xsum)
{
    uint tid = threadIdx.x;

    // reduce across the warp
    // This shuffle also produces messy sass when cta_reduce_sum is called conditionally
    for (int i = 16; i > 0; i >>= 1)
        xsum += shfl_xor(xsum, i);
    // if block is bigger than a warp, then reduce warps
    if (blockDim.x > 32)
    {
        __shared__ float Share[32];
        float4* Share4 = (float4*)Share;

        // Init shared to zero if needed
        if (blockDim.x != 1024)
        {
            if (tid < 32)
                Share[tid] = 0.0f;
            __syncthreads();
        }
        // store 1 warp reduced value to shared for each warp
        if ((tid & 31) == 0)
            Share[tid/32] = xsum;
        __syncthreads();
        // This is the problem code here:
        if (1)
        {
            // warp uniform shuffle.
            // compiler can't figure this out and generates messy branching code
            if (tid < 32)
            {
                // we could trim these shuffle ops depending on blockDim.x
                // keep it simple for illustrative purposes
                xsum = Share[tid];
                for (int i = 16; i > 0; i >>= 1)
                    xsum += shfl_xor(xsum, i);
            }
        }
        else
        {
            // Alternative using shared that's at least clean:
            if (tid == 0)
            {
                xsum = 0.0f;
                #pragma unroll 1
                for (int j = 0, s = 0; j < blockDim.x; j += 256, s += 2)
                    xsum += sum4(Share4[s]) + sum4(Share4[s+1]);
                *Share = xsum;
            }
            __syncthreads();
            xsum = *Share;
        }
    }
    return xsum;
}
__device__ __forceinline__ uint store_partial(float* PartialSum, uint* PartialCnt, float partial)
{
    uint tid = threadIdx.x;
    __shared__ uint Share[1];

    // You can try swapping out the conditional here to confuse the compiler
    // if ((tid & 1023) == 0)
    if (tid == 0)
    {
        // store partial sum to global
        PartialSum += blockIdx.x;
        asm volatile ("st.relaxed.gpu.global.f32 [%0], %1;" :: "l"(PartialSum), "f"(partial) );

        // given that the stg, atom and ldgs are all "strong", I believe this isn't needed to ensure ordering?
        // (see the sketch after this function)
        //__threadfence();

        // Count the number of stored partial sums.
        // Note: here is a case where the compiler IS aware that only 1 thread is active.
        // Otherwise it would see the warp uniform address and multiply the constant atomic add value by the number of active threads.
        // Try swapping out the conditional above to see this.
        uint count;
        asm volatile ("atom.relaxed.gpu.global.add.u32 %0, [%1], 1;" : "=r"(count): "l"(PartialCnt) );
        *Share = count + 1;
    }
    __syncthreads();
    return *Share;
}
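// A conservatively ordered variant of the store/increment pair above (my
// assumption, not from the original gist): relaxed operations are "strong"
// but carry no cross-address ordering, so an acq_rel increment would be the
// safe choice if the __threadfence() stays commented out. The hypothetical
// helper below releases this block's partial-sum store and acquires the
// stores released by every block that incremented before us.
__device__ __forceinline__ uint count_partial_acq_rel(uint* PartialCnt)
{
    uint count;
    asm volatile ("atom.acq_rel.gpu.global.add.u32 %0, [%1], 1;" : "=r"(count) : "l"(PartialCnt) );
    return count;
}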
// Do a tensor-wide sum squared reduction deterministically
__global__ void sum_squared_reduce(float* SumSquared, float* PartialSum, uint* PartialCnt, const float* X, uint size)
{
    uint tid = threadIdx.x;
    uint bid = blockIdx.x;

    float sum_squared = 0.0f;

    // tile the reduction among blocks and compute partial sums
    #pragma unroll 1
    for (uint offset = bid*blockDim.x + tid; offset < size; offset += gridDim.x*blockDim.x)
    {
        float x = __ldg(X + offset);
        sum_squared += x*x;
    }
    // reduce within this cta
    sum_squared = cta_reduce_sum(sum_squared);

    // store partial sum to global and check if we're the last block
    uint partial_cnt = store_partial(PartialSum, PartialCnt, sum_squared);

    // Note that this conditional is guaranteed to be warp uniform (no tid involved) but the compiler is unable to deduce this.
    if (partial_cnt == gridDim.x)
    {
        // Last block completes the sum
        sum_squared = 0.0f;
        #pragma unroll 1
        for (uint offset = tid; offset < gridDim.x; offset += blockDim.x)
        {
            // Do these ldg's also need to be strong to ensure ordering?
            float partial;
            asm ("ld.relaxed.gpu.global.f32 %0, [%1];" : "=f"(partial): "l"(PartialSum + offset) );
            sum_squared += partial;
        }
        // Final cta reduction
        // This time there's no way to avoid nasty shfl.sync branching because we're in the partial_cnt conditional
        sum_squared = cta_reduce_sum(sum_squared);

        // first thread outputs final reduction
        if (tid == 0)
            *SumSquared = sum_squared;
    }
}
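// A minimal host-side launch sketch (assumed usage, not part of the original
// gist). PartialSum needs one float per block, and PartialCnt must be zeroed
// before launch so the last block to finish can detect completion.
void launch_sum_squared_reduce(float* SumSquared, const float* X, uint size)
{
    const uint threads = 1024, blocks = 80;   // assumed launch shape; tune per GPU
    float* PartialSum; uint* PartialCnt;
    cudaMalloc(&PartialSum, blocks * sizeof(float));
    cudaMalloc(&PartialCnt, sizeof(uint));
    cudaMemset(PartialCnt, 0, sizeof(uint));  // completion count starts at zero
    sum_squared_reduce<<<blocks, threads>>>(SumSquared, PartialSum, PartialCnt, X, size);
    cudaDeviceSynchronize();
    cudaFree(PartialSum);
    cudaFree(PartialCnt);
}

// The SASS below is the sm_70 compiler output for the kernel above.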
.headerflags @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM70 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM70)"
.elftype @"ET_EXEC"

//--------------------- .text._Z18sum_squared_reducePfS_PjPKfj --------------------------
.section .text._Z18sum_squared_reducePfS_PjPKfj,"ax",@progbits
.sectionflags @"SHF_BARRIERS=1"
.sectioninfo @"SHI_REGISTERS=14"
.align 128
.global _Z18sum_squared_reducePfS_PjPKfj
.type _Z18sum_squared_reducePfS_PjPKfj,@function
.size _Z18sum_squared_reducePfS_PjPKfj,(.L_56 - _Z18sum_squared_reducePfS_PjPKfj)
.other _Z18sum_squared_reducePfS_PjPKfj,@"STO_CUDA_ENTRY STV_DEFAULT"
_Z18sum_squared_reducePfS_PjPKfj:
.text._Z18sum_squared_reducePfS_PjPKfj:
        /*0000*/ MOV R1, c[0x0][0x28] ;
        /*0010*/ @!PT SHFL.IDX PT, RZ, RZ, RZ, RZ ;
        /*0020*/ S2R R3, SR_CTAID.X ;
        /*0030*/ BMOV.32.CLEAR RZ, B0 ;
        /*0040*/ BSSY B0, `(.L_1) ;
        /*0050*/ IMAD.MOV.U32 R6, RZ, RZ, RZ ;
        /*0060*/ S2R R0, SR_TID.X ;
        /*0070*/ IMAD R2, R3, c[0x0][0x0], R0 ;
        /*0080*/ ISETP.GE.U32.AND P0, PT, R2, c[0x0][0x180], PT ;
        /*0090*/ @P0 BRA `(.L_2) ;
        /*00a0*/ MOV R6, RZ ;
.L_3:
        /*00b0*/ IMAD.MOV.U32 R5, RZ, RZ, 0x4 ;
        /*00c0*/ IMAD.WIDE.U32 R4, R2, R5, c[0x0][0x178] ;
        /*00d0*/ LDG.E.CONSTANT.SYS R5, [R4] ;
        /*00e0*/ MOV R7, c[0x0][0xc] ;
        /*00f0*/ IMAD R2, R7, c[0x0][0x0], R2 ;
        /*0100*/ ISETP.GE.U32.AND P0, PT, R2, c[0x0][0x180], PT ;
        /*0110*/ FFMA R6, R5, R5, R6 ;
        /*0120*/ @!P0 BRA `(.L_3) ;
.L_2:
        /*0130*/ BSYNC B0 ;
.L_1:
        /*0140*/ SHFL.BFLY PT, R5, R6, 0x10, 0x1f ;
        /*0150*/ IMAD.MOV.U32 R10, RZ, RZ, c[0x0][0x0] ;
        /*0160*/ SHF.R.U32.HI R4, RZ, 0x3, R0 ;
        /*0170*/ ISETP.GE.U32.AND P0, PT, R10, 0x21, PT ;
        /*0180*/ LOP3.LUT R4, R4, 0x1ffffffc, RZ, 0xc0, !PT ;
        /*0190*/ FADD R5, R5, R6 ;
        /*01a0*/ SHFL.BFLY PT, R2, R5, 0x8, 0x1f ;
        /*01b0*/ FADD R7, R5, R2 ;
        /*01c0*/ SHFL.BFLY PT, R2, R7, 0x4, 0x1f ;
        /*01d0*/ FADD R8, R7, R2 ;
        /*01e0*/ SHFL.BFLY PT, R9, R8, 0x2, 0x1f ;
        /*01f0*/ FADD R9, R8, R9 ;
        /*0200*/ SHFL.BFLY PT, R2, R9, 0x1, 0x1f ;
        /*0210*/ FADD R11, R9, R2 ;
        /*0220*/ SHF.L.U32 R2, R0, 0x2, RZ ;
        /*0230*/ @!P0 BRA `(.L_4) ;
        /*0240*/ ISETP.NE.AND P2, PT, R10, 0x400, PT ;
        /*0250*/ LOP3.LUT P1, RZ, R0, 0x1f, RZ, 0xc0, !PT ;
        /*0260*/ @!P2 BRA `(.L_5) ;
        /*0270*/ ISETP.GT.U32.AND P2, PT, R0, 0x1f, PT ;
        /*0280*/ @!P2 STS [R2], RZ ;
        /*0290*/ NOP ;
        /*02a0*/ BAR.SYNC 0x0 ;
.L_5:
        /*02b0*/ @!P1 STS [R4], R11 ;
        /*02c0*/ NOP ;
        /*02d0*/ BAR.SYNC 0x0 ;
        /*02e0*/ ISETP.GT.U32.AND P1, PT, R0, 0x1f, PT ;
        /*02f0*/ BMOV.32.CLEAR RZ, B0 ;
        /*0300*/ BSSY B0, `(.L_4) ;
        /*0310*/ @P1 BRA `(.L_6) ;
        /*0320*/ LDS.U R5, [R2] ;
        /*0330*/ BRA.DIV `(.L_7) ;
        /*0340*/ SHFL.BFLY PT, R6, R5, 0x10, 0x1f ;
        /*0350*/ FADD R6, R5, R6 ;
        /*0360*/ SHFL.BFLY PT, R7, R6, 0x8, 0x1f ;
        /*0370*/ FADD R7, R6, R7 ;
        /*0380*/ SHFL.BFLY PT, R8, R7, 0x4, 0x1f ;
        /*0390*/ FADD R8, R7, R8 ;
        /*03a0*/ SHFL.BFLY PT, R9, R8, 0x2, 0x1f ;
        /*03b0*/ FADD R9, R8, R9 ;
        /*03c0*/ SHFL.BFLY PT, R10, R9, 0x1, 0x1f ;
.L_18:
        /*03d0*/ FADD R11, R10, R9 ;
.L_6:
        /*03e0*/ BSYNC B0 ;
.L_4:
        /*03f0*/ ISETP.NE.AND P1, PT, R0, RZ, PT ;
        /*0400*/ BMOV.32.CLEAR RZ, B0 ;
        /*0410*/ BSSY B0, `(.L_8) ;
        /*0420*/ @P1 BRA `(.L_9) ;
        /*0430*/ IMAD.MOV.U32 R6, RZ, RZ, 0x4 ;
        /*0440*/ MOV R5, 0x1 ;
        /*0450*/ IMAD.MOV.U32 R8, RZ, RZ, c[0x0][0x170] ;
        /*0460*/ MOV R9, c[0x0][0x174] ;
        /*0470*/ IMAD.WIDE.U32 R6, R3, R6, c[0x0][0x168] ;
        /*0480*/ STG.E.STRONG.GPU [R6], R11 ;
        /*0490*/ ATOMG.E.ADD.STRONG.GPU PT, R8, [R8], R5 ;
        /*04a0*/ IADD3 R3, R8, 0x1, RZ ;
        /*04b0*/ STS [0x80], R3 ;
.L_9:
        /*04c0*/ BSYNC B0 ;
.L_8:
        /*04d0*/ WARPSYNC 0xffffffff ;
        /*04e0*/ NOP ;
        /*04f0*/ BAR.SYNC 0x0 ;
        /*0500*/ LDS.U R3, [0x80] ;
        /*0510*/ ISETP.NE.AND P2, PT, R3, c[0x0][0xc], PT ;
        /*0520*/ @P2 EXIT ;
        /*0530*/ ISETP.GT.U32.AND P2, PT, R0, c[0x0][0xc], PT ;
        /*0540*/ BMOV.32.CLEAR RZ, B0 ;
        /*0550*/ BSSY B0, `(.L_10) ;
        /*0560*/ IMAD.MOV.U32 R3, RZ, RZ, RZ ;
        /*0570*/ @P2 BRA `(.L_11) ;
        /*0580*/ MOV R5, R0 ;
.L_12:
        /*0590*/ IMAD.MOV.U32 R6, RZ, RZ, 0x4 ;
        /*05a0*/ IMAD.WIDE.U32 R6, R5, R6, c[0x0][0x168] ;
        /*05b0*/ LDG.E.STRONG.GPU R6, [R6] ;
        /*05c0*/ IADD3 R5, R5, c[0x0][0x0], RZ ;
        /*05d0*/ YIELD ;
        /*05e0*/ ISETP.GT.U32.AND P2, PT, R5, c[0x0][0xc], PT ;
        /*05f0*/ FADD R3, R6, R3 ;
        /*0600*/ @!P2 BRA `(.L_12) ;
.L_11:
        /*0610*/ BSYNC B0 ;
.L_10:
        /*0620*/ BRA.DIV `(.L_13) ;
        /*0630*/ SHFL.BFLY PT, R6, R3, 0x10, 0x1f ;
        /*0640*/ FADD R6, R6, R3 ;
        /*0650*/ SHFL.BFLY PT, R5, R6, 0x8, 0x1f ;
        /*0660*/ FADD R5, R6, R5 ;
        /*0670*/ SHFL.BFLY PT, R8, R5, 0x4, 0x1f ;
        /*0680*/ FADD R8, R5, R8 ;
        /*0690*/ SHFL.BFLY PT, R7, R8, 0x2, 0x1f ;
        /*06a0*/ FADD R9, R8, R7 ;
        /*06b0*/ SHFL.BFLY PT, R10, R9, 0x1, 0x1f ;
.L_19:
        /*06c0*/ FADD R9, R10, R9 ;
        /*06d0*/ @!P0 BRA `(.L_14) ;
        /*06e0*/ MOV R3, c[0x0][0x0] ;
        /*06f0*/ LOP3.LUT P0, RZ, R0, 0x1f, RZ, 0xc0, !PT ;
        /*0700*/ ISETP.NE.AND P2, PT, R3, 0x400, PT ;
        /*0710*/ @!P2 BRA `(.L_15) ;
        /*0720*/ ISETP.GT.U32.AND P2, PT, R0, 0x1f, PT ;
        /*0730*/ WARPSYNC 0xffffffff ;
        /*0740*/ @!P2 STS [R2], RZ ;
        /*0750*/ NOP ;
        /*0760*/ BAR.SYNC 0x0 ;
.L_15:
        /*0770*/ @!P0 STS [R4], R9 ;
        /*0780*/ ISETP.GT.U32.AND P0, PT, R0, 0x1f, PT ;
        /*0790*/ WARPSYNC 0xffffffff ;
        /*07a0*/ BMOV.32.CLEAR RZ, B0 ;
        /*07b0*/ BSSY B0, `(.L_14) ;
        /*07c0*/ NOP ;
        /*07d0*/ BAR.SYNC 0x0 ;
        /*07e0*/ @P0 BRA `(.L_16) ;
        /*07f0*/ LDS.U R2, [R2] ;
        /*0800*/ BRA.DIV `(.L_17) ;
        /*0810*/ SHFL.BFLY PT, R3, R2, 0x10, 0x1f ;
        /*0820*/ FADD R3, R2, R3 ;
        /*0830*/ SHFL.BFLY PT, R0, R3, 0x8, 0x1f ;
        /*0840*/ FADD R0, R3, R0 ;
        /*0850*/ SHFL.BFLY PT, R5, R0, 0x4, 0x1f ;
        /*0860*/ FADD R5, R0, R5 ;
        /*0870*/ SHFL.BFLY PT, R4, R5, 0x2, 0x1f ;
        /*0880*/ FADD R4, R5, R4 ;
        /*0890*/ SHFL.BFLY PT, R9, R4, 0x1, 0x1f ;
.L_20:
        /*08a0*/ FADD R9, R9, R4 ;
.L_16:
        /*08b0*/ BSYNC B0 ;
.L_14:
        /*08c0*/ @P1 EXIT ;
        /*08d0*/ MOV R2, c[0x0][0x160] ;
        /*08e0*/ IMAD.MOV.U32 R3, RZ, RZ, c[0x0][0x164] ;
        /*08f0*/ STG.E.SYS [R2], R9 ;
        /*0900*/ EXIT ;
.L_7:
        /*0910*/ IMAD.MOV.U32 R9, RZ, RZ, R5 ;
        /*0920*/ MOV R6, 0x10 ;
        /*0930*/ IMAD.MOV.U32 R7, RZ, RZ, 0x1f ;
        /*0940*/ MOV R10, 0xffffffff ;
        /*0950*/ MOV R8, 0x970 ;
        /*0960*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
        /*0970*/ FADD R9, R5, R10 ;
        /*0980*/ MOV R7, 0x1f ;
        /*0990*/ IMAD.MOV.U32 R6, RZ, RZ, 0x8 ;
        /*09a0*/ MOV R8, 0x9d0 ;
        /*09b0*/ IMAD.MOV.U32 R10, RZ, RZ, -0x1 ;
        /*09c0*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
        /*09d0*/ FADD R9, R9, R10 ;
        /*09e0*/ MOV R6, 0x4 ;
        /*09f0*/ IMAD.MOV.U32 R7, RZ, RZ, 0x1f ;
        /*0a00*/ MOV R10, 0xffffffff ;
        /*0a10*/ MOV R8, 0xa30 ;
        /*0a20*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
        /*0a30*/ FADD R9, R9, R10 ;
        /*0a40*/ MOV R7, 0x1f ;
        /*0a50*/ IMAD.MOV.U32 R6, RZ, RZ, 0x2 ;
        /*0a60*/ MOV R8, 0xa90 ;
        /*0a70*/ IMAD.MOV.U32 R10, RZ, RZ, -0x1 ;
        /*0a80*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
        /*0a90*/ FADD R9, R9, R10 ;
        /*0aa0*/ MOV R6, 0x1 ;
        /*0ab0*/ IMAD.MOV.U32 R7, RZ, RZ, 0x1f ;
        /*0ac0*/ MOV R10, 0xffffffff ;
        /*0ad0*/ MOV R8, 0xaf0 ;
        /*0ae0*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
        /*0af0*/ BRA `(.L_18) ;
.L_13:
        /*0b00*/ MOV R9, R3 ;
        /*0b10*/ IMAD.MOV.U32 R6, RZ, RZ, 0x10 ;
        /*0b20*/ MOV R7, 0x1f ;
        /*0b30*/ IMAD.MOV.U32 R10, RZ, RZ, -0x1 ;
        /*0b40*/ MOV R8, 0xb60 ;
        /*0b50*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
        /*0b60*/ FADD R9, R3, R10 ;
        /*0b70*/ MOV R6, 0x8 ;
        /*0b80*/ IMAD.MOV.U32 R7, RZ, RZ, 0x1f ;
        /*0b90*/ MOV R10, 0xffffffff ;
        /*0ba0*/ MOV R8, 0xbc0 ;
        /*0bb0*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
        /*0bc0*/ FADD R9, R9, R10 ;
        /*0bd0*/ MOV R7, 0x1f ;
        /*0be0*/ IMAD.MOV.U32 R6, RZ, RZ, 0x4 ;
        /*0bf0*/ MOV R8, 0xc20 ;
        /*0c00*/ IMAD.MOV.U32 R10, RZ, RZ, -0x1 ;
        /*0c10*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
        /*0c20*/ FADD R9, R9, R10 ;
        /*0c30*/ MOV R6, 0x2 ;
        /*0c40*/ IMAD.MOV.U32 R7, RZ, RZ, 0x1f ;
        /*0c50*/ MOV R10, 0xffffffff ;
        /*0c60*/ MOV R8, 0xc80 ;
        /*0c70*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
        /*0c80*/ FADD R9, R9, R10 ;
        /*0c90*/ MOV R7, 0x1f ;
        /*0ca0*/ IMAD.MOV.U32 R6, RZ, RZ, 0x1 ;
        /*0cb0*/ MOV R8, 0xce0 ;
        /*0cc0*/ IMAD.MOV.U32 R10, RZ, RZ, -0x1 ;
        /*0cd0*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
        /*0ce0*/ BRA `(.L_19) ;
.L_17:
        /*0cf0*/ IMAD.MOV.U32 R9, RZ, RZ, R2 ;
        /*0d00*/ MOV R6, 0x10 ;
        /*0d10*/ IMAD.MOV.U32 R7, RZ, RZ, 0x1f ;
        /*0d20*/ MOV R10, 0xffffffff ;
        /*0d30*/ MOV R8, 0xd50 ;
        /*0d40*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
        /*0d50*/ FADD R9, R2, R10 ;
        /*0d60*/ MOV R7, 0x1f ;
        /*0d70*/ IMAD.MOV.U32 R6, RZ, RZ, 0x8 ;
        /*0d80*/ MOV R8, 0xdb0 ;
        /*0d90*/ IMAD.MOV.U32 R10, RZ, RZ, -0x1 ;
        /*0da0*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
        /*0db0*/ FADD R9, R9, R10 ;
        /*0dc0*/ MOV R6, 0x4 ;
        /*0dd0*/ IMAD.MOV.U32 R7, RZ, RZ, 0x1f ;
        /*0de0*/ MOV R10, 0xffffffff ;
        /*0df0*/ MOV R8, 0xe10 ;
        /*0e00*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
        /*0e10*/ FADD R9, R9, R10 ;
        /*0e20*/ MOV R7, 0x1f ;
        /*0e30*/ IMAD.MOV.U32 R6, RZ, RZ, 0x2 ;
        /*0e40*/ MOV R10, 0xffffffff ;
        /*0e50*/ MOV R8, 0xe70 ;
        /*0e60*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
        /*0e70*/ FADD R4, R9, R10 ;
        /*0e80*/ MOV R7, 0x1f ;
        /*0e90*/ IMAD.MOV.U32 R6, RZ, RZ, 0x1 ;
        /*0ea0*/ MOV R10, 0xffffffff ;
        /*0eb0*/ IMAD.MOV.U32 R9, RZ, RZ, R4 ;
        /*0ec0*/ MOV R8, 0xee0 ;
        /*0ed0*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
        /*0ee0*/ MOV R9, R10 ;
        /*0ef0*/ BRA `(.L_20) ;
.weak $_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly
.type $_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly,@function
.size $_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly,(.L_56 - $_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly)
$_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly:
        /*0f00*/ WARPSYNC R10 ;
        /*0f10*/ SHFL.BFLY PT, R10, R9, R6, R7 ;
        /*0f20*/ MOV R6, R8 ;
        /*0f30*/ MOV R7, 0x0 ;
        /*0f40*/ RET.REL.NODEC R6 `(_Z18sum_squared_reducePfS_PjPKfj) ;
.L_21:
        /*0f50*/ BRA `(.L_21);
        /*0f60*/ NOP;
        /*0f70*/ NOP;
.L_56: