Skip to content

Instantly share code, notes, and snippets.

@allanmac
Last active August 29, 2015 14:13
Show Gist options
  • Save allanmac/e109b77bd81f05b243c6 to your computer and use it in GitHub Desktop.
Save allanmac/e109b77bd81f05b243c6 to your computer and use it in GitHub Desktop.
Signed 64-bit reduce-add using shuffles (untested)
// -*- compile-command: "nvcc -m 32 -arch sm_50 -Xptxas=-v,-abi=no -cubin scan_64.cu" ; -*-
#include <stdint.h>
//
//
//
#define KERNEL_QUALIFIERS __global__
#define KERNEL_QUALIFIERS_EXTERN extern KERNEL_QUALIFIERS
#define KERNEL_QUALIFIERS_EXTERN_C extern "C" KERNEL_QUALIFIERS
//
//
//
#ifndef _DEBUG
#define DEVICE_FUNCTION_QUALIFIERS __device__ __forceinline__
#define DEVICE_INTRINSIC_QUALIFIERS __device__ __forceinline__
#else
#define DEVICE_FUNCTION_QUALIFIERS __device__
#define DEVICE_INTRINSIC_QUALIFIERS __device__
#endif
//
//
//
#define DEVICE_STATIC_FUNCTION_QUALIFIERS static DEVICE_FUNCTION_QUALIFIERS
#define DEVICE_STATIC_INTRINSIC_QUALIFIERS static DEVICE_INTRINSIC_QUALIFIERS
//
//
//
#define RESTRICT __restrict__
//
//
//
#define WARP_SIZE 32
//
//
//
typedef uint32_t u32;
typedef int64_t s64;
//
//
//
DEVICE_STATIC_INTRINSIC_QUALIFIERS
s64
warp_reduce_add_shuffled_v1(s64 v)
{
asm("{ \n\t"
".reg .s32 lo; \n\t"
".reg .s32 hi; \n\t"
"mov.b64 {lo,hi}, %0; \n\t"
".reg .s32 lo_t; \n\t"
".reg .s32 hi_t; \n\t" :: "l"(v));
for (u32 d = 16; d >= 1; d /= 2)
asm("shfl.bfly.b32 lo_t, lo, %0, 0x1F; \n\t"
"shfl.bfly.b32 hi_t, hi, %0, 0x1F; \n\t"
"add.cc.s32 lo, lo_t, lo; \n\t"
"addc.cc.s32 hi, hi_t, hi; \n\t" :: "r"(d));
asm("mov.b64 %0, {lo,hi}; \n\t"
"}" : "=l"(v));
return v;
}
//
//
//
DEVICE_STATIC_INTRINSIC_QUALIFIERS
s64
warp_reduce_add_shuffled_v2(s64 v)
{
for (u32 d = 16; d >= 1; d /= 2)
{
asm("{ \n\t"
".reg .s32 lo; \n\t"
".reg .s32 hi; \n\t"
".reg .s32 lo_t; \n\t"
".reg .s32 hi_t; \n\t"
".reg .s64 t; \n\t"
"mov.b64 {lo,hi}, %0; \n\t"
"shfl.bfly.b32 lo_t, lo, %1, 0x1F; \n\t"
"shfl.bfly.b32 hi_t, hi, %1, 0x1F; \n\t"
"mov.b64 t, {lo_t,hi_t}; \n\t"
"add.s64 %0, %0, t; \n\t"
"}"
: "+l"(v) : "r"(d));
}
return v;
}
//
//
//
KERNEL_QUALIFIERS_EXTERN_C
void
reduction_v1_kernel(const s64* const RESTRICT vin, s64* const RESTRICT vout)
{
s64 v = vin[threadIdx.x];
v = warp_reduce_add_shuffled_v1(v);
vout[threadIdx.x] = v;
}
//
//
//
KERNEL_QUALIFIERS_EXTERN_C
void
reduction_v2_kernel(const s64* const RESTRICT vin, s64* const RESTRICT vout)
{
s64 v = vin[threadIdx.x];
v = warp_reduce_add_shuffled_v2(v);
vout[threadIdx.x] = v;
}
//
//
//
@allanmac
Copy link
Author

cuobjdump -sass scan_b64.cubin:

    Function : reduction_v2_kernel
    .headerflags    @"EF_CUDA_SM50 EF_CUDA_PTX_SM(EF_CUDA_SM50)"

        S2R R6, SR_TID.X;                       
        ISCADD R0, R6.reuse, c[0x0][0x140], 0x3;
        LDG.CI.64 R4, [R0];                     

        SHFL.BFLY PT, R1, R4, 0x10, 0x1f;       
        IADD R3.CC, R4, R1;                     
        SHFL.BFLY PT, R1, R5, 0x10, 0x1f;       

        IADD.X R4, R5, R1;                      
        SHFL.BFLY PT, R2, R3, 0x8, 0x1f;        
        IADD R3.CC, R2, R3;                     

        SHFL.BFLY PT, R1, R4, 0x8, 0x1f;        
        IADD.X R4, R1, R4;                      
        SHFL.BFLY PT, R2, R3, 0x4, 0x1f;        

        IADD R1.CC, R2, R3;                     
        SHFL.BFLY PT, R0, R4, 0x4, 0x1f;        
        IADD.X R3, R0, R4;                      

        SHFL.BFLY PT, R2, R1, 0x2, 0x1f;        
        IADD R2.CC, R2, R1;                     
        SHFL.BFLY PT, R0, R3, 0x2, 0x1f;        

        IADD.X R3, R0, R3;                      
        SHFL.BFLY PT, R1, R2, 0x1, 0x1f;        
        IADD R0.CC, R1, R2;                     

        SHFL.BFLY PT, R1, R3, 0x1, 0x1f;        
        ISCADD R2, R6, c[0x0][0x144], 0x3;      
        IADD.X R1, R1, R3;                      

        STG.64 [R2], R0;                        
        EXIT;                                   
        BRA 0x118;                              

        NOP;                                    
        NOP;                                    
        NOP;                                    

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment