// A case for making the compiler more threadIdx aware in conditional code.
// Proposed solution:
// Walk the dependencies of any predicate gating a shfl.sync to look for threadIdx.
// Simulate all 1024 values of threadIdx with the full predicate expression to see if it's warp uniform.
// Or check whether only a single thread is active, for other optimizations (like in the atomic add below).
// This can't be that complicated to do.
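// A minimal host-side sketch of that analysis (hypothetical, not part of the
// original gist): evaluate the extracted predicate for every threadIdx value
// and check that all 32 lanes of each warp agree. `pred` stands in for
// whatever predicate expression the compiler pulled out of the branch.
static bool predicate_is_warp_uniform(bool (*pred)(uint tid), uint block_size)
{
    for (uint warp = 0; warp < block_size / 32; warp++)
    {
        bool first = pred(warp * 32);
        for (uint lane = 1; lane < 32; lane++)
            if (pred(warp * 32 + lane) != first)
                return false; // lanes disagree => branch may diverge within a warp
    }
    return true; // every warp agrees => shfl.sync can be emitted branch-free
}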
__device__ __forceinline__ float shfl_xor(float var, int laneMask)
{
    float ret;
    asm ("shfl.sync.bfly.b32 %0, %1, %2, 0x1f, 0xffffffff;" : "=f"(ret) : "f"(var), "r"(laneMask));
    return ret;
}
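// For reference, this wrapper should be equivalent to the plain CUDA intrinsic
// (full-warp mask, default width) shown below; an assumed alternative, the
// inline asm above is what the gist actually exercises.
__device__ __forceinline__ float shfl_xor_intrinsic(float var, int laneMask)
{
    return __shfl_xor_sync(0xffffffff, var, laneMask);
}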
__device__ __forceinline__ float sum4(float4 a) { return (a.x + a.y) + (a.z + a.w); }

__device__ __forceinline__ float cta_reduce_sum(float xsum)
{
    uint tid = threadIdx.x;

    // reduce across the warp
    // This shuffle also produces messy sass when cta_reduce_sum is called conditionally
    for (int i = 16; i > 0; i >>= 1)
        xsum += shfl_xor(xsum, i);
    // if block is bigger than a warp, then reduce warps
    if (blockDim.x > 32)
    {
        __shared__ float Share[32];
        float4* Share4 = (float4*)Share;

        // Init shared to zero if needed
        if (blockDim.x != 1024)
        {
            if (tid < 32)
                Share[tid] = 0.0f;
            __syncthreads();
        }
        // store 1 warp reduced value to shared for each warp
        if ((tid & 31) == 0)
            Share[tid/32] = xsum;
        __syncthreads();
        // This is the problem code here:
        if (1)
        {
            // warp uniform shuffle.
            // compiler can't figure this out and generates messy branching code
            if (tid < 32)
            {
                // we could trim these shuffle ops depending on blockDim.x
                // keep it simple for illustrative purposes
                xsum = Share[tid];
                for (int i = 16; i > 0; i >>= 1)
                    xsum += shfl_xor(xsum, i);
            }
        }
        else
        {
            // Alternative using shared that's at least clean:
            if (tid == 0)
            {
                xsum = 0.0f;
                #pragma unroll 1
                for (int j = 0, s = 0; j < blockDim.x; j += 256, s += 2)
                    xsum += sum4(Share4[s]) + sum4(Share4[s+1]);
                *Share = xsum;
            }
            __syncthreads();
            xsum = *Share;
        }
    }
    return xsum;
}
__device__ __forceinline__ uint store_partial(float* PartialSum, uint* PartialCnt, float partial)
{
    uint tid = threadIdx.x;
    __shared__ uint Share[1];

    // You can try swapping out the conditional here to confuse the compiler
    // if ((tid & 1023) == 0)
    if (tid == 0)
    {
        // store partial sum to global
        PartialSum += blockIdx.x;
        asm volatile ("st.relaxed.gpu.global.f32 [%0], %1;" :: "l"(PartialSum), "f"(partial) );

        // given that the stg, atom and ldgs are all "strong", I believe this isn't needed to ensure ordering?
        // (see the sketch after this function)
        //__threadfence();

        // Count the number of stored partial sums.
        // Note: here is a case where the compiler IS aware that only 1 thread is active.
        // Otherwise it would see the warp uniform address and multiply the constant atomic add value by the number of active threads.
        // Try swapping out the conditional above to see this.
        uint count;
        asm volatile ("atom.relaxed.gpu.global.add.u32 %0, [%1], 1;" : "=r"(count): "l"(PartialCnt) );
        *Share = count + 1;
    }
    __syncthreads();
    return *Share;
}
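// A conservatively ordered variant of the store/increment pair above (my
// assumption, not from the original gist): relaxed operations are "strong"
// but carry no cross-address ordering, so an acq_rel increment would be the
// safe choice if the __threadfence() stays commented out. The hypothetical
// helper below releases this block's partial-sum store and acquires the
// stores released by every block that incremented before us.
__device__ __forceinline__ uint count_partial_acq_rel(uint* PartialCnt)
{
    uint count;
    asm volatile ("atom.acq_rel.gpu.global.add.u32 %0, [%1], 1;" : "=r"(count) : "l"(PartialCnt) );
    return count;
}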
// Do a tensor-wide sum squared reduction deterministically
__global__ void sum_squared_reduce(float* SumSquared, float* PartialSum, uint* PartialCnt, const float* X, uint size)
{
    uint tid = threadIdx.x;
    uint bid = blockIdx.x;

    float sum_squared = 0.0f;

    // tile the reduction among blocks and compute partial sums
    #pragma unroll 1
    for (uint offset = bid*blockDim.x + tid; offset < size; offset += gridDim.x*blockDim.x)
    {
        float x = __ldg(X + offset);
        sum_squared += x*x;
    }
    // reduce within this cta
    sum_squared = cta_reduce_sum(sum_squared);

    // store partial sum to global and check if we're the last block
    uint partial_cnt = store_partial(PartialSum, PartialCnt, sum_squared);

    // Note that this conditional is guaranteed to be warp uniform (no tid involved) but the compiler is unable to deduce this.
    if (partial_cnt == gridDim.x)
    {
        // Last block completes the sum
        sum_squared = 0.0f;
        #pragma unroll 1
        for (uint offset = tid; offset < gridDim.x; offset += blockDim.x)
        {
            // Do these ldg's also need to be strong to ensure ordering?
            float partial;
            asm ("ld.relaxed.gpu.global.f32 %0, [%1];" : "=f"(partial): "l"(PartialSum + offset) );
            sum_squared += partial;
        }
        // Final cta reduction
        // This time there's no way to avoid nasty shfl.sync branching because we're in the partial_cnt conditional
        sum_squared = cta_reduce_sum(sum_squared);

        // first thread outputs final reduction
        if (tid == 0)
            *SumSquared = sum_squared;
    }
}
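// A minimal host-side launch sketch (assumed usage, not part of the original
// gist). PartialSum needs one float per block, and PartialCnt must be zeroed
// before launch so the last block to finish can detect completion.
void launch_sum_squared_reduce(float* SumSquared, const float* X, uint size)
{
    const uint threads = 1024, blocks = 80;   // assumed launch shape; tune per GPU
    float* PartialSum; uint* PartialCnt;
    cudaMalloc(&PartialSum, blocks * sizeof(float));
    cudaMalloc(&PartialCnt, sizeof(uint));
    cudaMemset(PartialCnt, 0, sizeof(uint));  // completion count starts at zero
    sum_squared_reduce<<<blocks, threads>>>(SumSquared, PartialSum, PartialCnt, X, size);
    cudaDeviceSynchronize();
    cudaFree(PartialSum);
    cudaFree(PartialCnt);
}

// The SASS below is the sm_70 compiler output for the kernel above.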
.headerflags @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM70 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM70)"
.elftype @"ET_EXEC"

//--------------------- .text._Z18sum_squared_reducePfS_PjPKfj --------------------------
.section .text._Z18sum_squared_reducePfS_PjPKfj,"ax",@progbits
.sectionflags @"SHF_BARRIERS=1"
.sectioninfo @"SHI_REGISTERS=14"
.align 128
.global _Z18sum_squared_reducePfS_PjPKfj
.type _Z18sum_squared_reducePfS_PjPKfj,@function
.size _Z18sum_squared_reducePfS_PjPKfj,(.L_56 - _Z18sum_squared_reducePfS_PjPKfj)
.other _Z18sum_squared_reducePfS_PjPKfj,@"STO_CUDA_ENTRY STV_DEFAULT"
_Z18sum_squared_reducePfS_PjPKfj:
.text._Z18sum_squared_reducePfS_PjPKfj:
        /*0000*/ MOV R1, c[0x0][0x28] ;
        /*0010*/ @!PT SHFL.IDX PT, RZ, RZ, RZ, RZ ;
        /*0020*/ S2R R3, SR_CTAID.X ;
        /*0030*/ BMOV.32.CLEAR RZ, B0 ;
        /*0040*/ BSSY B0, `(.L_1) ;
        /*0050*/ IMAD.MOV.U32 R6, RZ, RZ, RZ ;
        /*0060*/ S2R R0, SR_TID.X ;
        /*0070*/ IMAD R2, R3, c[0x0][0x0], R0 ;
        /*0080*/ ISETP.GE.U32.AND P0, PT, R2, c[0x0][0x180], PT ;
        /*0090*/ @P0 BRA `(.L_2) ;
        /*00a0*/ MOV R6, RZ ;
.L_3:
        /*00b0*/ IMAD.MOV.U32 R5, RZ, RZ, 0x4 ;
        /*00c0*/ IMAD.WIDE.U32 R4, R2, R5, c[0x0][0x178] ;
        /*00d0*/ LDG.E.CONSTANT.SYS R5, [R4] ;
        /*00e0*/ MOV R7, c[0x0][0xc] ;
        /*00f0*/ IMAD R2, R7, c[0x0][0x0], R2 ;
        /*0100*/ ISETP.GE.U32.AND P0, PT, R2, c[0x0][0x180], PT ;
        /*0110*/ FFMA R6, R5, R5, R6 ;
        /*0120*/ @!P0 BRA `(.L_3) ;
.L_2:
        /*0130*/ BSYNC B0 ;
.L_1:
        /*0140*/ SHFL.BFLY PT, R5, R6, 0x10, 0x1f ;
        /*0150*/ IMAD.MOV.U32 R10, RZ, RZ, c[0x0][0x0] ;
        /*0160*/ SHF.R.U32.HI R4, RZ, 0x3, R0 ;
        /*0170*/ ISETP.GE.U32.AND P0, PT, R10, 0x21, PT ;
        /*0180*/ LOP3.LUT R4, R4, 0x1ffffffc, RZ, 0xc0, !PT ;
        /*0190*/ FADD R5, R5, R6 ;
        /*01a0*/ SHFL.BFLY PT, R2, R5, 0x8, 0x1f ;
        /*01b0*/ FADD R7, R5, R2 ;
        /*01c0*/ SHFL.BFLY PT, R2, R7, 0x4, 0x1f ;
        /*01d0*/ FADD R8, R7, R2 ;
        /*01e0*/ SHFL.BFLY PT, R9, R8, 0x2, 0x1f ;
        /*01f0*/ FADD R9, R8, R9 ;
        /*0200*/ SHFL.BFLY PT, R2, R9, 0x1, 0x1f ;
        /*0210*/ FADD R11, R9, R2 ;
        /*0220*/ SHF.L.U32 R2, R0, 0x2, RZ ;
        /*0230*/ @!P0 BRA `(.L_4) ;
        /*0240*/ ISETP.NE.AND P2, PT, R10, 0x400, PT ;
        /*0250*/ LOP3.LUT P1, RZ, R0, 0x1f, RZ, 0xc0, !PT ;
        /*0260*/ @!P2 BRA `(.L_5) ;
        /*0270*/ ISETP.GT.U32.AND P2, PT, R0, 0x1f, PT ;
        /*0280*/ @!P2 STS [R2], RZ ;
        /*0290*/ NOP ;
        /*02a0*/ BAR.SYNC 0x0 ;
.L_5:
        /*02b0*/ @!P1 STS [R4], R11 ;
        /*02c0*/ NOP ;
        /*02d0*/ BAR.SYNC 0x0 ;
        /*02e0*/ ISETP.GT.U32.AND P1, PT, R0, 0x1f, PT ;
        /*02f0*/ BMOV.32.CLEAR RZ, B0 ;
        /*0300*/ BSSY B0, `(.L_4) ;
        /*0310*/ @P1 BRA `(.L_6) ;
        /*0320*/ LDS.U R5, [R2] ;
        /*0330*/ BRA.DIV `(.L_7) ;
        /*0340*/ SHFL.BFLY PT, R6, R5, 0x10, 0x1f ;
        /*0350*/ FADD R6, R5, R6 ;
        /*0360*/ SHFL.BFLY PT, R7, R6, 0x8, 0x1f ;
        /*0370*/ FADD R7, R6, R7 ;
        /*0380*/ SHFL.BFLY PT, R8, R7, 0x4, 0x1f ;
        /*0390*/ FADD R8, R7, R8 ;
        /*03a0*/ SHFL.BFLY PT, R9, R8, 0x2, 0x1f ;
        /*03b0*/ FADD R9, R8, R9 ;
        /*03c0*/ SHFL.BFLY PT, R10, R9, 0x1, 0x1f ;
.L_18:
        /*03d0*/ FADD R11, R10, R9 ;
.L_6:
        /*03e0*/ BSYNC B0 ;
.L_4:
        /*03f0*/ ISETP.NE.AND P1, PT, R0, RZ, PT ;
        /*0400*/ BMOV.32.CLEAR RZ, B0 ;
        /*0410*/ BSSY B0, `(.L_8) ;
        /*0420*/ @P1 BRA `(.L_9) ;
        /*0430*/ IMAD.MOV.U32 R6, RZ, RZ, 0x4 ;
        /*0440*/ MOV R5, 0x1 ;
        /*0450*/ IMAD.MOV.U32 R8, RZ, RZ, c[0x0][0x170] ;
        /*0460*/ MOV R9, c[0x0][0x174] ;
        /*0470*/ IMAD.WIDE.U32 R6, R3, R6, c[0x0][0x168] ;
        /*0480*/ STG.E.STRONG.GPU [R6], R11 ;
        /*0490*/ ATOMG.E.ADD.STRONG.GPU PT, R8, [R8], R5 ;
        /*04a0*/ IADD3 R3, R8, 0x1, RZ ;
        /*04b0*/ STS [0x80], R3 ;
.L_9:
        /*04c0*/ BSYNC B0 ;
.L_8:
        /*04d0*/ WARPSYNC 0xffffffff ;
        /*04e0*/ NOP ;
        /*04f0*/ BAR.SYNC 0x0 ;
        /*0500*/ LDS.U R3, [0x80] ;
        /*0510*/ ISETP.NE.AND P2, PT, R3, c[0x0][0xc], PT ;
        /*0520*/ @P2 EXIT ;
        /*0530*/ ISETP.GT.U32.AND P2, PT, R0, c[0x0][0xc], PT ;
        /*0540*/ BMOV.32.CLEAR RZ, B0 ;
        /*0550*/ BSSY B0, `(.L_10) ;
        /*0560*/ IMAD.MOV.U32 R3, RZ, RZ, RZ ;
        /*0570*/ @P2 BRA `(.L_11) ;
        /*0580*/ MOV R5, R0 ;
.L_12:
        /*0590*/ IMAD.MOV.U32 R6, RZ, RZ, 0x4 ;
        /*05a0*/ IMAD.WIDE.U32 R6, R5, R6, c[0x0][0x168] ;
        /*05b0*/ LDG.E.STRONG.GPU R6, [R6] ;
        /*05c0*/ IADD3 R5, R5, c[0x0][0x0], RZ ;
        /*05d0*/ YIELD ;
        /*05e0*/ ISETP.GT.U32.AND P2, PT, R5, c[0x0][0xc], PT ;
        /*05f0*/ FADD R3, R6, R3 ;
        /*0600*/ @!P2 BRA `(.L_12) ;
.L_11:
        /*0610*/ BSYNC B0 ;
.L_10:
        /*0620*/ BRA.DIV `(.L_13) ;
        /*0630*/ SHFL.BFLY PT, R6, R3, 0x10, 0x1f ;
        /*0640*/ FADD R6, R6, R3 ;
        /*0650*/ SHFL.BFLY PT, R5, R6, 0x8, 0x1f ;
        /*0660*/ FADD R5, R6, R5 ;
        /*0670*/ SHFL.BFLY PT, R8, R5, 0x4, 0x1f ;
        /*0680*/ FADD R8, R5, R8 ;
        /*0690*/ SHFL.BFLY PT, R7, R8, 0x2, 0x1f ;
        /*06a0*/ FADD R9, R8, R7 ;
        /*06b0*/ SHFL.BFLY PT, R10, R9, 0x1, 0x1f ;
.L_19:
        /*06c0*/ FADD R9, R10, R9 ;
        /*06d0*/ @!P0 BRA `(.L_14) ;
        /*06e0*/ MOV R3, c[0x0][0x0] ;
        /*06f0*/ LOP3.LUT P0, RZ, R0, 0x1f, RZ, 0xc0, !PT ;
        /*0700*/ ISETP.NE.AND P2, PT, R3, 0x400, PT ;
        /*0710*/ @!P2 BRA `(.L_15) ;
        /*0720*/ ISETP.GT.U32.AND P2, PT, R0, 0x1f, PT ;
        /*0730*/ WARPSYNC 0xffffffff ;
        /*0740*/ @!P2 STS [R2], RZ ;
        /*0750*/ NOP ;
        /*0760*/ BAR.SYNC 0x0 ;
.L_15:
        /*0770*/ @!P0 STS [R4], R9 ;
        /*0780*/ ISETP.GT.U32.AND P0, PT, R0, 0x1f, PT ;
        /*0790*/ WARPSYNC 0xffffffff ;
        /*07a0*/ BMOV.32.CLEAR RZ, B0 ;
        /*07b0*/ BSSY B0, `(.L_14) ;
        /*07c0*/ NOP ;
        /*07d0*/ BAR.SYNC 0x0 ;
        /*07e0*/ @P0 BRA `(.L_16) ;
        /*07f0*/ LDS.U R2, [R2] ;
        /*0800*/ BRA.DIV `(.L_17) ;
        /*0810*/ SHFL.BFLY PT, R3, R2, 0x10, 0x1f ;
        /*0820*/ FADD R3, R2, R3 ;
        /*0830*/ SHFL.BFLY PT, R0, R3, 0x8, 0x1f ;
        /*0840*/ FADD R0, R3, R0 ;
        /*0850*/ SHFL.BFLY PT, R5, R0, 0x4, 0x1f ;
        /*0860*/ FADD R5, R0, R5 ;
        /*0870*/ SHFL.BFLY PT, R4, R5, 0x2, 0x1f ;
        /*0880*/ FADD R4, R5, R4 ;
        /*0890*/ SHFL.BFLY PT, R9, R4, 0x1, 0x1f ;
.L_20:
        /*08a0*/ FADD R9, R9, R4 ;
.L_16:
        /*08b0*/ BSYNC B0 ;
.L_14:
        /*08c0*/ @P1 EXIT ;
        /*08d0*/ MOV R2, c[0x0][0x160] ;
        /*08e0*/ IMAD.MOV.U32 R3, RZ, RZ, c[0x0][0x164] ;
        /*08f0*/ STG.E.SYS [R2], R9 ;
        /*0900*/ EXIT ;
.L_7:
        /*0910*/ IMAD.MOV.U32 R9, RZ, RZ, R5 ;
        /*0920*/ MOV R6, 0x10 ;
        /*0930*/ IMAD.MOV.U32 R7, RZ, RZ, 0x1f ;
        /*0940*/ MOV R10, 0xffffffff ;
        /*0950*/ MOV R8, 0x970 ;
        /*0960*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
        /*0970*/ FADD R9, R5, R10 ;
        /*0980*/ MOV R7, 0x1f ;
        /*0990*/ IMAD.MOV.U32 R6, RZ, RZ, 0x8 ;
        /*09a0*/ MOV R8, 0x9d0 ;
        /*09b0*/ IMAD.MOV.U32 R10, RZ, RZ, -0x1 ;
        /*09c0*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
        /*09d0*/ FADD R9, R9, R10 ;
        /*09e0*/ MOV R6, 0x4 ;
        /*09f0*/ IMAD.MOV.U32 R7, RZ, RZ, 0x1f ;
        /*0a00*/ MOV R10, 0xffffffff ;
        /*0a10*/ MOV R8, 0xa30 ;
        /*0a20*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
        /*0a30*/ FADD R9, R9, R10 ;
        /*0a40*/ MOV R7, 0x1f ;
        /*0a50*/ IMAD.MOV.U32 R6, RZ, RZ, 0x2 ;
        /*0a60*/ MOV R8, 0xa90 ;
        /*0a70*/ IMAD.MOV.U32 R10, RZ, RZ, -0x1 ;
        /*0a80*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
        /*0a90*/ FADD R9, R9, R10 ;
        /*0aa0*/ MOV R6, 0x1 ;
        /*0ab0*/ IMAD.MOV.U32 R7, RZ, RZ, 0x1f ;
        /*0ac0*/ MOV R10, 0xffffffff ;
        /*0ad0*/ MOV R8, 0xaf0 ;
        /*0ae0*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
        /*0af0*/ BRA `(.L_18) ;
.L_13:
        /*0b00*/ MOV R9, R3 ;
        /*0b10*/ IMAD.MOV.U32 R6, RZ, RZ, 0x10 ;
        /*0b20*/ MOV R7, 0x1f ;
        /*0b30*/ IMAD.MOV.U32 R10, RZ, RZ, -0x1 ;
        /*0b40*/ MOV R8, 0xb60 ;
        /*0b50*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
        /*0b60*/ FADD R9, R3, R10 ;
        /*0b70*/ MOV R6, 0x8 ;
        /*0b80*/ IMAD.MOV.U32 R7, RZ, RZ, 0x1f ;
        /*0b90*/ MOV R10, 0xffffffff ;
        /*0ba0*/ MOV R8, 0xbc0 ;
        /*0bb0*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
        /*0bc0*/ FADD R9, R9, R10 ;
        /*0bd0*/ MOV R7, 0x1f ;
        /*0be0*/ IMAD.MOV.U32 R6, RZ, RZ, 0x4 ;
        /*0bf0*/ MOV R8, 0xc20 ;
        /*0c00*/ IMAD.MOV.U32 R10, RZ, RZ, -0x1 ;
        /*0c10*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
        /*0c20*/ FADD R9, R9, R10 ;
        /*0c30*/ MOV R6, 0x2 ;
        /*0c40*/ IMAD.MOV.U32 R7, RZ, RZ, 0x1f ;
        /*0c50*/ MOV R10, 0xffffffff ;
        /*0c60*/ MOV R8, 0xc80 ;
        /*0c70*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
        /*0c80*/ FADD R9, R9, R10 ;
        /*0c90*/ MOV R7, 0x1f ;
        /*0ca0*/ IMAD.MOV.U32 R6, RZ, RZ, 0x1 ;
        /*0cb0*/ MOV R8, 0xce0 ;
        /*0cc0*/ IMAD.MOV.U32 R10, RZ, RZ, -0x1 ;
        /*0cd0*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
        /*0ce0*/ BRA `(.L_19) ;
.L_17:
        /*0cf0*/ IMAD.MOV.U32 R9, RZ, RZ, R2 ;
        /*0d00*/ MOV R6, 0x10 ;
        /*0d10*/ IMAD.MOV.U32 R7, RZ, RZ, 0x1f ;
        /*0d20*/ MOV R10, 0xffffffff ;
        /*0d30*/ MOV R8, 0xd50 ;
        /*0d40*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
        /*0d50*/ FADD R9, R2, R10 ;
        /*0d60*/ MOV R7, 0x1f ;
        /*0d70*/ IMAD.MOV.U32 R6, RZ, RZ, 0x8 ;
        /*0d80*/ MOV R8, 0xdb0 ;
        /*0d90*/ IMAD.MOV.U32 R10, RZ, RZ, -0x1 ;
        /*0da0*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
        /*0db0*/ FADD R9, R9, R10 ;
        /*0dc0*/ MOV R6, 0x4 ;
        /*0dd0*/ IMAD.MOV.U32 R7, RZ, RZ, 0x1f ;
        /*0de0*/ MOV R10, 0xffffffff ;
        /*0df0*/ MOV R8, 0xe10 ;
        /*0e00*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
        /*0e10*/ FADD R9, R9, R10 ;
        /*0e20*/ MOV R7, 0x1f ;
        /*0e30*/ IMAD.MOV.U32 R6, RZ, RZ, 0x2 ;
        /*0e40*/ MOV R10, 0xffffffff ;
        /*0e50*/ MOV R8, 0xe70 ;
        /*0e60*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
        /*0e70*/ FADD R4, R9, R10 ;
        /*0e80*/ MOV R7, 0x1f ;
        /*0e90*/ IMAD.MOV.U32 R6, RZ, RZ, 0x1 ;
        /*0ea0*/ MOV R10, 0xffffffff ;
        /*0eb0*/ IMAD.MOV.U32 R9, RZ, RZ, R4 ;
        /*0ec0*/ MOV R8, 0xee0 ;
        /*0ed0*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
        /*0ee0*/ MOV R9, R10 ;
        /*0ef0*/ BRA `(.L_20) ;
.weak $_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly
.type $_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly,@function
.size $_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly,(.L_56 - $_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly)
$_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly:
        /*0f00*/ WARPSYNC R10 ;
        /*0f10*/ SHFL.BFLY PT, R10, R9, R6, R7 ;
        /*0f20*/ MOV R6, R8 ;
        /*0f30*/ MOV R7, 0x0 ;
        /*0f40*/ RET.REL.NODEC R6 `(_Z18sum_squared_reducePfS_PjPKfj) ;
.L_21:
        /*0f50*/ BRA `(.L_21);
        /*0f60*/ NOP;
        /*0f70*/ NOP;
.L_56: