allanmac · August 29, 2015 14:13 · allanmac · Jan 11, 2015
diff --git a/scan_64.cu b/scan_64.cu
 // -*- compile-command: "nvcc -m 32 -arch sm_50 -Xptxas=-v,-abi=no -cubin scan_64.cu" ; -*-

 #include <stdint.h>

 //
 //
 //

 #define KERNEL_QUALIFIERS                   __global__
 #define KERNEL_QUALIFIERS_EXTERN            extern     KERNEL_QUALIFIERS
 #define KERNEL_QUALIFIERS_EXTERN_C          extern "C" KERNEL_QUALIFIERS

 //
 //
 //

 #ifndef _DEBUG
 #define DEVICE_FUNCTION_QUALIFIERS          __device__ __forceinline__
 #define DEVICE_INTRINSIC_QUALIFIERS         __device__ __forceinline__
 #else
 #define DEVICE_FUNCTION_QUALIFIERS          __device__
 #define DEVICE_INTRINSIC_QUALIFIERS         __device__
 #endif

 //
 //
 //

 #define DEVICE_STATIC_FUNCTION_QUALIFIERS   static DEVICE_FUNCTION_QUALIFIERS
 #define DEVICE_STATIC_INTRINSIC_QUALIFIERS  static DEVICE_INTRINSIC_QUALIFIERS

 //
 //
 //

 #define RESTRICT                            __restrict__

 //
 //
 //

 #define WARP_SIZE                           32

 //
 //
 //

 typedef uint32_t u32;
 typedef int64_t  s64;

 //
 //
 //

 DEVICE_STATIC_INTRINSIC_QUALIFIERS
 s64
 warp_reduce_add_shuffled_v1(s64 v)
 {
  asm("{                                     \n\t"
      ".reg .s32 lo;                         \n\t"
      ".reg .s32 hi;                         \n\t"
      "mov.b64 {lo,hi}, %0;                  \n\t"
      ".reg .s32 lo_t;                       \n\t"
      ".reg .s32 hi_t;                       \n\t" :: "l"(v));

  for (u32 d = 16; d >= 1; d /= 2)
    asm("shfl.bfly.b32 lo_t, lo,   %0, 0x1F; \n\t"
        "shfl.bfly.b32 hi_t, hi,   %0, 0x1F; \n\t"
        "add.cc.s32    lo,   lo_t, lo;       \n\t"
        "addc.cc.s32   hi,   hi_t, hi;       \n\t" :: "r"(d));

  asm("mov.b64 %0, {lo,hi};                  \n\t"
      "}"                                          : "=l"(v));

  return v;
 }

 //
 //
 //

 DEVICE_STATIC_INTRINSIC_QUALIFIERS
 s64
 warp_reduce_add_shuffled_v2(s64 v)
 {
  for (u32 d = 16; d >= 1; d /= 2)
    {
      asm("{                                 \n\t"
          ".reg .s32 lo;                     \n\t"
          ".reg .s32 hi;                     \n\t"
          ".reg .s32 lo_t;                   \n\t"
          ".reg .s32 hi_t;                   \n\t"
          ".reg .s64 t;                      \n\t"
          "mov.b64 {lo,hi}, %0;              \n\t"
          "shfl.bfly.b32 lo_t, lo, %1, 0x1F; \n\t"
          "shfl.bfly.b32 hi_t, hi, %1, 0x1F; \n\t"
          "mov.b64 t, {lo_t,hi_t};           \n\t"
          "add.s64 %0, %0, t;                \n\t"
          "}"
          : "+l"(v) : "r"(d));
    }

  return v;
 }

 //
 //
 //

 KERNEL_QUALIFIERS_EXTERN_C
 void
 reduction_v1_kernel(const s64* const RESTRICT vin, s64* const RESTRICT vout)
 {
  s64 v = vin[threadIdx.x];
  
  v = warp_reduce_add_shuffled_v1(v);

  vout[threadIdx.x] = v;
 }

 //
 //
 //

 KERNEL_QUALIFIERS_EXTERN_C
 void
 reduction_v2_kernel(const s64* const RESTRICT vin, s64* const RESTRICT vout)
 {
  s64 v = vin[threadIdx.x];
  
  v = warp_reduce_add_shuffled_v2(v);

  vout[threadIdx.x] = v;
 }

 //
 //
 //
	// -- compile-command: "nvcc -m 32 -arch sm_50 -Xptxas=-v,-abi=no -cubin scan_64.cu" ; --

	#include <stdint.h>

	//
	//
	//

	#define KERNEL_QUALIFIERS __global__
	#define KERNEL_QUALIFIERS_EXTERN extern KERNEL_QUALIFIERS
	#define KERNEL_QUALIFIERS_EXTERN_C extern "C" KERNEL_QUALIFIERS

	//
	//
	//

	#ifndef _DEBUG
	#define DEVICE_FUNCTION_QUALIFIERS __device__ __forceinline__
	#define DEVICE_INTRINSIC_QUALIFIERS __device__ __forceinline__
	#else
	#define DEVICE_FUNCTION_QUALIFIERS __device__
	#define DEVICE_INTRINSIC_QUALIFIERS __device__
	#endif

	//
	//
	//

	#define DEVICE_STATIC_FUNCTION_QUALIFIERS static DEVICE_FUNCTION_QUALIFIERS
	#define DEVICE_STATIC_INTRINSIC_QUALIFIERS static DEVICE_INTRINSIC_QUALIFIERS

	//
	//
	//

	#define RESTRICT __restrict__

	//
	//
	//

	#define WARP_SIZE 32

	//
	//
	//

	typedef uint32_t u32;
	typedef int64_t s64;

	//
	//
	//

	DEVICE_STATIC_INTRINSIC_QUALIFIERS
	s64
	warp_reduce_add_shuffled_v1(s64 v)
	{
	asm("{ \n\t"
	".reg .s32 lo; \n\t"
	".reg .s32 hi; \n\t"
	"mov.b64 {lo,hi}, %0; \n\t"
	".reg .s32 lo_t; \n\t"
	".reg .s32 hi_t; \n\t" :: "l"(v));

	for (u32 d = 16; d >= 1; d /= 2)
	asm("shfl.bfly.b32 lo_t, lo, %0, 0x1F; \n\t"
	"shfl.bfly.b32 hi_t, hi, %0, 0x1F; \n\t"
	"add.cc.s32 lo, lo_t, lo; \n\t"
	"addc.cc.s32 hi, hi_t, hi; \n\t" :: "r"(d));

	asm("mov.b64 %0, {lo,hi}; \n\t"
	"}" : "=l"(v));

	return v;
	}

	//
	//
	//

	DEVICE_STATIC_INTRINSIC_QUALIFIERS
	s64
	warp_reduce_add_shuffled_v2(s64 v)
	{
	for (u32 d = 16; d >= 1; d /= 2)
	{
	asm("{ \n\t"
	".reg .s32 lo; \n\t"
	".reg .s32 hi; \n\t"
	".reg .s32 lo_t; \n\t"
	".reg .s32 hi_t; \n\t"
	".reg .s64 t; \n\t"
	"mov.b64 {lo,hi}, %0; \n\t"
	"shfl.bfly.b32 lo_t, lo, %1, 0x1F; \n\t"
	"shfl.bfly.b32 hi_t, hi, %1, 0x1F; \n\t"
	"mov.b64 t, {lo_t,hi_t}; \n\t"
	"add.s64 %0, %0, t; \n\t"
	"}"
	: "+l"(v) : "r"(d));
	}

	return v;
	}

	//
	//
	//

	KERNEL_QUALIFIERS_EXTERN_C
	void
	reduction_v1_kernel(const s64* const RESTRICT vin, s64* const RESTRICT vout)
	{
	s64 v = vin[threadIdx.x];

	v = warp_reduce_add_shuffled_v1(v);

	vout[threadIdx.x] = v;
	}

	//
	//
	//

	KERNEL_QUALIFIERS_EXTERN_C
	void
	reduction_v2_kernel(const s64* const RESTRICT vin, s64* const RESTRICT vout)
	{
	s64 v = vin[threadIdx.x];

	v = warp_reduce_add_shuffled_v2(v);

	vout[threadIdx.x] = v;
	}

	//
	//
	//