allanmac · September 24, 2016 12:20 · allanmac · Aug 5, 2013 · allanmac · Aug 5, 2013
diff --git a/scan.cu b/scan.cu

 #include <stdio.h>

 //
 //
 //

 #define WARP_SIZE                     32
 #define VOLATILE                      volatile

 #define KERNEL_QUALIFIERS             extern "C" __global__
 #define DEVICE_FUNCTION_QUALIFIERS    __device__
 #define DEVICE_INTRINSIC_QUALIFIERS   __device__ __forceinline__

 //
 //
 //

 //
 // Block-wide __shared__ storage for the warp scan.
 //
 // The only member, `scratch` (WARP_SIZE unsigned ints, one slot per
 // lane), is compiled in only when __CUDA_ARCH__ < 300; it is the
 // buffer used by the shared-memory path of plusScan().  For
 // __CUDA_ARCH__ >= 300 the struct has no members.
 //
 // VOLATILE expands to `volatile` -- presumably so that a lane always
 // re-reads values stored by other lanes instead of a cached copy.
 //
 VOLATILE __shared__ struct
 {

 #if __CUDA_ARCH__ < 300
  // one scan slot per lane of the warp
  unsigned int scratch[WARP_SIZE];
 #endif

 } shared;

 //
 //
 //

 /**
  * Lane index of the calling thread inside its warp, read directly
  * from the PTX special register %laneid.
  *
  * @return lane index of the caller
  */
 DEVICE_INTRINSIC_QUALIFIERS
 unsigned int laneId()
 {
  unsigned int lane;
  asm("mov.u32 %0, %%laneid;" : "=r"(lane));
  return lane;
 }

 /**
  * 32-bit mask in which only the bit of the calling lane is set.
  *
  * Builds for __CUDA_ARCH__ >= 200 read the PTX special register
  * %lanemask_eq; all other builds derive the mask from laneId().
  *
  * @return mask with the caller's lane bit set
  */
 DEVICE_INTRINSIC_QUALIFIERS
 unsigned int laneMaskEQ()
 {
 #if __CUDA_ARCH__ >= 200
  unsigned int mask;
  asm("mov.u32 %0, %%lanemask_eq;" : "=r"(mask));
  return mask;
 #else
  const unsigned int lane = laneId();
  return 1u << lane;
 #endif
 }

 //
 //
 //

 /**
  * Convert a warp-level inclusive scan to an exclusive scan by
  * shifting the lanes to the right and assigning an 'identity' value
  * to lane 0.
  *
  * Every lane receives the value held by the lane directly below it;
  * lane 0 has no lower neighbour (the shuffle predicate is false) and
  * receives `i` instead.
  *
  * Only compiled for __CUDA_ARCH__ >= 300 (warp shuffle).  When the
  * toolkit is CUDA 9 or newer the `shfl.sync` form is emitted with a
  * full member mask, because the mask-less `shfl` is deprecated and is
  * not accepted for sm_70 and newer targets.  All 32 lanes of the warp
  * must therefore call this function together.
  *
  * @param v  this lane's inclusive scan value
  * @param i  identity value assigned to lane 0
  *
  * @return scan result for lane
  */
 #if (__CUDA_ARCH__ >= 300)

 DEVICE_FUNCTION_QUALIFIERS
 unsigned int
 toExclusiveScan(unsigned int v, const unsigned int i)
 {
  // `volatile` keeps the compiler from moving or merging the
  // cross-lane exchange as if it were a pure function of its inputs
 #if (defined(CUDART_VERSION) && (CUDART_VERSION >= 9000)) || \
     (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9))

  asm volatile("{                                                  \n\t"
               "  .reg .pred p;                                    \n\t"
               "  shfl.sync.up.b32 %0|p, %0, 0x1, 0x0, 0xffffffff; \n\t"
               "  @!p mov.u32 %0, %1;                              \n\t"
               "}"
               : "+r"(v) : "r"(i));

 #else

  // toolkits older than CUDA 9 only know the mask-less shuffle
  asm volatile("{                                   \n\t"
               "  .reg .pred p;                     \n\t"
               "  shfl.up.b32 %0|p, %0, 0x1,  0x0;  \n\t"
               "  @!p mov.u32 %0, %1;               \n\t"
               "}"
               : "+r"(v) : "r"(i));

 #endif

  return v;
 }

 #endif

 //
 //
 //

 /**
  * Warp-level "plus scan" (prefix sum) over one unsigned value per lane.
  *
  * __CUDA_ARCH__ >= 300: five shuffle-up/add steps (deltas 1,2,4,8,16),
  * PTX taken from the PTX ISA PDF.  With CUDA 9 or newer the
  * `shfl.sync` form with a full member mask is emitted, because the
  * mask-less `shfl` is deprecated and not accepted for sm_70 and newer
  * targets; all 32 lanes of the warp must call this function together.
  *
  * Otherwise: the same five steps are performed through the volatile
  * `shared.scratch` array.  No barriers are used, so this path depends
  * on the lanes of the warp executing in lockstep.
  *
  * @param v          this lane's input value
  * @param inclusive  true:  result includes this lane's own input
  *                   false: result is the sum of all lower lanes only
  *                          (lane 0 receives 0)
  *
  * @return scan result for lane
  */
 DEVICE_FUNCTION_QUALIFIERS
 unsigned int
 plusScan(unsigned int v, const bool inclusive)
 {
 #if (__CUDA_ARCH__ >= 300)

  //
  // pick the shuffle spelling once; the scan steps below are shared
  //
 #if (defined(CUDART_VERSION) && (CUDART_VERSION >= 9000)) || \
     (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9))
 #define PLUS_SCAN_SHFL_UP(delta)                                  \
  "  shfl.sync.up.b32 t|p, %0, " delta ", 0x0, 0xffffffff; \n\t"
 #else
 #define PLUS_SCAN_SHFL_UP(delta)                                  \
  "  shfl.up.b32 t|p, %0, " delta ", 0x0; \n\t"
 #endif

  // one step: fetch the partial sum `delta` lanes below and add it
  // when such a lane exists (predicate p)
 #define PLUS_SCAN_STEP(delta)                                     \
  PLUS_SCAN_SHFL_UP(delta)                                         \
  "  @p add.u32 %0, t, %0; \n\t"

  // `volatile` keeps the compiler from moving or merging the
  // cross-lane exchanges
  asm volatile("{              \n\t"
               "  .reg .u32  t; \n\t"
               "  .reg .pred p; \n\t"
               PLUS_SCAN_STEP("0x1")
               PLUS_SCAN_STEP("0x2")
               PLUS_SCAN_STEP("0x4")
               PLUS_SCAN_STEP("0x8")
               PLUS_SCAN_STEP("0x10")
               "}"
               : "+r"(v));

 #undef PLUS_SCAN_STEP
 #undef PLUS_SCAN_SHFL_UP

  if (inclusive)
    return v;
  else
    return toExclusiveScan(v,0u);

 #else

  //
  // STORE_IF_LT_WARP_MINUS(l) publishes this lane's running sum for
  // the next step.  To mask the redundant shared stores, define it
  // instead as:
  //
  //   if (lid < WARP_SIZE-(l)) scratch[0] = v
  //
 #define STORE_IF_LT_WARP_MINUS(l)               \
  scratch[0] = v

  const    unsigned int  lid     = laneId();
  volatile unsigned int* scratch = shared.scratch + lid;

  if (inclusive)
    {
      scratch[0] = v;
    }
  else
    {
      // exclusive: shift the inputs up by one lane; the last lane's
      // input is dropped and slot 0 receives the identity (0)
      if (lid == (WARP_SIZE-1))
        scratch[-(WARP_SIZE-1)] = 0u;
      else
        scratch[1] = v;
    }

  v = scratch[0];

  // each nesting level adds the partial sum 1, 2, 4, 8, 16 lanes below
  if (lid >= 1)
    {
      v = v + scratch[-1];

      STORE_IF_LT_WARP_MINUS(2);

      if (lid >= 2)
        {
          v = v + scratch[-2];

          STORE_IF_LT_WARP_MINUS(4);

          if (lid >= 4)
            {
              v = v + scratch[-4];

              STORE_IF_LT_WARP_MINUS(8);

              if (lid >= 8)
                {
                  v = v + scratch[-8];

                  STORE_IF_LT_WARP_MINUS(16);

                  if (lid >= 16)
                    v = v + scratch[-16];
                }
            }
        }
    }

  // keep the helper macro local to this function
 #undef STORE_IF_LT_WARP_MINUS

  return v;

 #endif
 }

 //
 //
 //

 //
 // Inclusive plus-scan kernel: each thread loads vin[threadIdx.x],
 // scans it with plusScan(...,true) and stores the result to
 // vout[threadIdx.x].  Indexing uses threadIdx.x only.
 //
 KERNEL_QUALIFIERS
 void inclusivePlusScanKernel(const unsigned int* const vin,
                              unsigned int*       const vout)
 {
  const unsigned int tid     = threadIdx.x;
  const unsigned int scanned = plusScan(vin[tid],true);

  vout[tid] = scanned;
 }

 //
 //
 //

 //
 // Exclusive plus-scan kernel: each thread loads vin[threadIdx.x],
 // scans it with plusScan(...,false) and stores the result to
 // vout[threadIdx.x].  Indexing uses threadIdx.x only.
 //
 KERNEL_QUALIFIERS
 void exclusivePlusScanKernel(const unsigned int* const vin,
                              unsigned int*       const vout)
 {
  const unsigned int tid     = threadIdx.x;
  const unsigned int scanned = plusScan(vin[tid],false);

  vout[tid] = scanned;
 }

 //
 //
 //

 /**
  * Print a label followed by WARP_SIZE values on one line of stdout.
  *
  * @param msg   label, right-aligned in a 6 character field
  * @param warp  host array holding at least WARP_SIZE values
  */
 void printScan(const char*         const msg,
                const unsigned int* const warp)
 {
  printf("%6s:",msg);

  // the values are unsigned, so print them with %u -- %d would show
  // anything above INT_MAX as a negative number
  for (int ii=0; ii<WARP_SIZE; ii++)
    printf("%2u ",warp[ii]);

  printf("\n");
 }

 //
 //
 //

 /**
  * Report a failed CUDA runtime call on stderr.
  *
  * @param err   status returned by the runtime call
  * @param what  short description of the call, used in the message
  *
  * @return true when err is cudaSuccess
  */
 static bool cudaCallOk(const cudaError_t err, const char* const what)
 {
  if (err == cudaSuccess)
    return true;

  fprintf(stderr,"%s failed: %s\n",what,cudaGetErrorString(err));

  return false;
 }

 /**
  * Scan one warp of ones on the selected device and print the input
  * and the result.
  *
  * usage: scan [device] [0=exclusive] -- otherwise defaults to inclusive
  *
  * @return 0 on success, 1 if any CUDA call or the kernel launch failed
  */
 int main(int argc, char** argv)
 {
  // a third argument selects the mode even if more arguments follow
  const int  device    = (argc >= 2) ? atoi(argv[1])      : 0;
  const bool inclusive = (argc >= 3) ? atoi(argv[2]) != 0 : true;

  cudaDeviceProp props;

  // also rejects an invalid device ordinal before anything is printed
  if (!cudaCallOk(cudaGetDeviceProperties(&props,device),"cudaGetDeviceProperties"))
    return 1;

  printf("%s (%2d)\n",props.name,props.multiProcessorCount);
  printf("%s scan ...\n",inclusive ? "inclusive" : "exclusive");

  if (!cudaCallOk(cudaSetDevice(device),"cudaSetDevice"))
    return 1;

  //
  // BUFFERS
  //

  const size_t  bytes = sizeof(unsigned int) * WARP_SIZE;
  unsigned int* vin   = NULL;
  unsigned int* vout  = NULL;

  unsigned int win[WARP_SIZE] =
    {
      1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
      1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1
    };

  unsigned int wout[WARP_SIZE];

  // && short-circuits: the first failure skips the remaining calls
  bool ok =
    cudaCallOk(cudaMalloc(&vin, bytes),"cudaMalloc(vin)")  &&
    cudaCallOk(cudaMalloc(&vout,bytes),"cudaMalloc(vout)") &&
    cudaCallOk(cudaMemcpy(vin,win,bytes,cudaMemcpyHostToDevice),
               "cudaMemcpy(host to device)");

  //
  // LAUNCH KERNEL
  //

  if (ok)
    {
      if (inclusive)
        inclusivePlusScanKernel<<<1,WARP_SIZE>>>(vin,vout);
      else
        exclusivePlusScanKernel<<<1,WARP_SIZE>>>(vin,vout);

      // launch errors are only visible through cudaGetLastError();
      // faults inside the kernel surface at the synchronize
      ok =
        cudaCallOk(cudaGetLastError(),"kernel launch")               &&
        cudaCallOk(cudaDeviceSynchronize(),"cudaDeviceSynchronize")  &&
        cudaCallOk(cudaMemcpy(wout,vout,bytes,cudaMemcpyDeviceToHost),
                   "cudaMemcpy(device to host)");
    }

  if (ok)
    {
      printScan("warp",win);
      printScan("scan",wout);
    }

  //
  // CLEAN UP -- cudaFree() of a null pointer is a no-op
  //

  cudaFree(vin);
  cudaFree(vout);

  cudaDeviceReset();

  return ok ? 0 : 1;
 }

	#include <stdio.h>

	//
	//
	//

	#define WARP_SIZE 32
	#define VOLATILE volatile

	#define KERNEL_QUALIFIERS extern "C" __global__
	#define DEVICE_FUNCTION_QUALIFIERS __device__
	#define DEVICE_INTRINSIC_QUALIFIERS __device__ __forceinline__

	//
	//
	//

	//
	// Block-wide __shared__ storage for the warp scan.
	//
	// The only member, `scratch` (WARP_SIZE unsigned ints, one slot per
	// lane), is compiled in only when __CUDA_ARCH__ < 300; it is the
	// buffer used by the shared-memory path of plusScan().  For
	// __CUDA_ARCH__ >= 300 the struct has no members.
	//
	// VOLATILE expands to `volatile` -- presumably so that a lane always
	// re-reads values stored by other lanes instead of a cached copy.
	//
	VOLATILE __shared__ struct
	{

	#if __CUDA_ARCH__ < 300
	// one scan slot per lane of the warp
	unsigned int scratch[WARP_SIZE];
	#endif

	} shared;

	//
	//
	//

	/**
	 * Lane index of the calling thread inside its warp, read directly
	 * from the PTX special register %laneid.
	 *
	 * @return lane index of the caller
	 */
	DEVICE_INTRINSIC_QUALIFIERS
	unsigned int laneId()
	{
	  unsigned int lane;
	  asm("mov.u32 %0, %%laneid;" : "=r"(lane));
	  return lane;
	}

	/**
	 * 32-bit mask in which only the bit of the calling lane is set.
	 *
	 * Builds for __CUDA_ARCH__ >= 200 read the PTX special register
	 * %lanemask_eq; all other builds derive the mask from laneId().
	 *
	 * @return mask with the caller's lane bit set
	 */
	DEVICE_INTRINSIC_QUALIFIERS
	unsigned int laneMaskEQ()
	{
	#if __CUDA_ARCH__ >= 200
	  unsigned int mask;
	  asm("mov.u32 %0, %%lanemask_eq;" : "=r"(mask));
	  return mask;
	#else
	  const unsigned int lane = laneId();
	  return 1u << lane;
	#endif
	}

	//
	//
	//

	/**
	 * Convert a warp-level inclusive scan to an exclusive scan by
	 * shifting the lanes to the right and assigning an 'identity' value
	 * to lane 0.
	 *
	 * Every lane receives the value held by the lane directly below it;
	 * lane 0 has no lower neighbour (the shuffle predicate is false) and
	 * receives `i` instead.
	 *
	 * Only compiled for __CUDA_ARCH__ >= 300 (warp shuffle).  When the
	 * toolkit is CUDA 9 or newer the `shfl.sync` form is emitted with a
	 * full member mask, because the mask-less `shfl` is deprecated and is
	 * not accepted for sm_70 and newer targets.  All 32 lanes of the warp
	 * must therefore call this function together.
	 *
	 * @param v  this lane's inclusive scan value
	 * @param i  identity value assigned to lane 0
	 *
	 * @return scan result for lane
	 */
	#if (__CUDA_ARCH__ >= 300)

	DEVICE_FUNCTION_QUALIFIERS
	unsigned int
	toExclusiveScan(unsigned int v, const unsigned int i)
	{
	  // `volatile` keeps the compiler from moving or merging the
	  // cross-lane exchange as if it were a pure function of its inputs;
	  // the destination/predicate pair is written with a plain `|`
	#if (defined(CUDART_VERSION) && (CUDART_VERSION >= 9000)) || \
	    (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9))

	  asm volatile("{ \n\t"
	               " .reg .pred p; \n\t"
	               " shfl.sync.up.b32 %0|p, %0, 0x1, 0x0, 0xffffffff; \n\t"
	               " @!p mov.u32 %0, %1; \n\t"
	               "}"
	               : "+r"(v) : "r"(i));

	#else

	  // toolkits older than CUDA 9 only know the mask-less shuffle
	  asm volatile("{ \n\t"
	               " .reg .pred p; \n\t"
	               " shfl.up.b32 %0|p, %0, 0x1, 0x0; \n\t"
	               " @!p mov.u32 %0, %1; \n\t"
	               "}"
	               : "+r"(v) : "r"(i));

	#endif

	  return v;
	}

	#endif

	//
	//
	//

	/**
	 * Warp-level "plus scan" (prefix sum) over one unsigned value per lane.
	 *
	 * __CUDA_ARCH__ >= 300: five shuffle-up/add steps (deltas 1,2,4,8,16),
	 * PTX taken from the PTX ISA PDF.  With CUDA 9 or newer the
	 * `shfl.sync` form with a full member mask is emitted, because the
	 * mask-less `shfl` is deprecated and not accepted for sm_70 and newer
	 * targets; all 32 lanes of the warp must call this function together.
	 *
	 * Otherwise: the same five steps are performed through the volatile
	 * `shared.scratch` array.  No barriers are used, so this path depends
	 * on the lanes of the warp executing in lockstep.
	 *
	 * @param v          this lane's input value
	 * @param inclusive  true:  result includes this lane's own input
	 *                   false: result is the sum of all lower lanes only
	 *                          (lane 0 receives 0)
	 *
	 * @return scan result for lane
	 */
	DEVICE_FUNCTION_QUALIFIERS
	unsigned int
	plusScan(unsigned int v, const bool inclusive)
	{
	#if (__CUDA_ARCH__ >= 300)

	  //
	  // pick the shuffle spelling once; the scan steps below are shared.
	  // The destination/predicate pair is written with a plain `|`.
	  //
	#if (defined(CUDART_VERSION) && (CUDART_VERSION >= 9000)) || \
	    (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9))
	#define PLUS_SCAN_SHFL_UP(delta) \
	  " shfl.sync.up.b32 t|p, %0, " delta ", 0x0, 0xffffffff; \n\t"
	#else
	#define PLUS_SCAN_SHFL_UP(delta) \
	  " shfl.up.b32 t|p, %0, " delta ", 0x0; \n\t"
	#endif

	  // one step: fetch the partial sum `delta` lanes below and add it
	  // when such a lane exists (predicate p)
	#define PLUS_SCAN_STEP(delta) \
	  PLUS_SCAN_SHFL_UP(delta) \
	  " @p add.u32 %0, t, %0; \n\t"

	  // `volatile` keeps the compiler from moving or merging the
	  // cross-lane exchanges
	  asm volatile("{ \n\t"
	               " .reg .u32 t; \n\t"
	               " .reg .pred p; \n\t"
	               PLUS_SCAN_STEP("0x1")
	               PLUS_SCAN_STEP("0x2")
	               PLUS_SCAN_STEP("0x4")
	               PLUS_SCAN_STEP("0x8")
	               PLUS_SCAN_STEP("0x10")
	               "}"
	               : "+r"(v));

	#undef PLUS_SCAN_STEP
	#undef PLUS_SCAN_SHFL_UP

	  if (inclusive)
	    return v;
	  else
	    return toExclusiveScan(v,0u);

	#else

	  //
	  // STORE_IF_LT_WARP_MINUS(l) publishes this lane's running sum for
	  // the next step.  To mask the redundant shared stores, define it
	  // instead as:
	  //
	  //   if (lid < WARP_SIZE-(l)) scratch[0] = v
	  //
	#define STORE_IF_LT_WARP_MINUS(l) \
	  scratch[0] = v

	  const unsigned int lid = laneId();
	  volatile unsigned int* scratch = shared.scratch + lid;

	  if (inclusive)
	    {
	      scratch[0] = v;
	    }
	  else
	    {
	      // exclusive: shift the inputs up by one lane; the last lane's
	      // input is dropped and slot 0 receives the identity (0)
	      if (lid == (WARP_SIZE-1))
	        scratch[-(WARP_SIZE-1)] = 0u;
	      else
	        scratch[1] = v;
	    }

	  v = scratch[0];

	  // each nesting level adds the partial sum 1, 2, 4, 8, 16 lanes below
	  if (lid >= 1)
	    {
	      v = v + scratch[-1];

	      STORE_IF_LT_WARP_MINUS(2);

	      if (lid >= 2)
	        {
	          v = v + scratch[-2];

	          STORE_IF_LT_WARP_MINUS(4);

	          if (lid >= 4)
	            {
	              v = v + scratch[-4];

	              STORE_IF_LT_WARP_MINUS(8);

	              if (lid >= 8)
	                {
	                  v = v + scratch[-8];

	                  STORE_IF_LT_WARP_MINUS(16);

	                  if (lid >= 16)
	                    v = v + scratch[-16];
	                }
	            }
	        }
	    }

	  // keep the helper macro local to this function
	#undef STORE_IF_LT_WARP_MINUS

	  return v;

	#endif
	}

	//
	//
	//

	//
	// Inclusive plus-scan kernel: each thread loads vin[threadIdx.x],
	// scans it with plusScan(...,true) and stores the result to
	// vout[threadIdx.x].  Indexing uses threadIdx.x only.
	//
	KERNEL_QUALIFIERS
	void inclusivePlusScanKernel(const unsigned int* const vin,
	                             unsigned int* const vout)
	{
	  const unsigned int tid     = threadIdx.x;
	  const unsigned int scanned = plusScan(vin[tid],true);

	  vout[tid] = scanned;
	}

	//
	//
	//

	//
	// Exclusive plus-scan kernel: each thread loads vin[threadIdx.x],
	// scans it with plusScan(...,false) and stores the result to
	// vout[threadIdx.x].  Indexing uses threadIdx.x only.
	//
	KERNEL_QUALIFIERS
	void exclusivePlusScanKernel(const unsigned int* const vin,
	                             unsigned int* const vout)
	{
	  const unsigned int tid     = threadIdx.x;
	  const unsigned int scanned = plusScan(vin[tid],false);

	  vout[tid] = scanned;
	}

	//
	//
	//

	/**
	 * Print a label followed by WARP_SIZE values on one line of stdout.
	 *
	 * @param msg   label, right-aligned in a 6 character field
	 * @param warp  host array holding at least WARP_SIZE values
	 */
	void printScan(const char* const msg,
	               const unsigned int* const warp)
	{
	  printf("%6s:",msg);

	  // the values are unsigned, so print them with %u -- %d would show
	  // anything above INT_MAX as a negative number
	  for (int ii=0; ii<WARP_SIZE; ii++)
	    printf("%2u ",warp[ii]);

	  printf("\n");
	}

	//
	//
	//

	/**
	 * Report a failed CUDA runtime call on stderr.
	 *
	 * @param err   status returned by the runtime call
	 * @param what  short description of the call, used in the message
	 *
	 * @return true when err is cudaSuccess
	 */
	static bool cudaCallOk(const cudaError_t err, const char* const what)
	{
	  if (err == cudaSuccess)
	    return true;

	  fprintf(stderr,"%s failed: %s\n",what,cudaGetErrorString(err));

	  return false;
	}

	/**
	 * Scan one warp of ones on the selected device and print the input
	 * and the result.
	 *
	 * usage: scan [device] [0=exclusive] -- otherwise defaults to inclusive
	 *
	 * @return 0 on success, 1 if any CUDA call or the kernel launch failed
	 */
	int main(int argc, char** argv)
	{
	  // a third argument selects the mode even if more arguments follow
	  const int device = (argc >= 2) ? atoi(argv[1]) : 0;
	  const bool inclusive = (argc >= 3) ? atoi(argv[2]) != 0 : true;

	  cudaDeviceProp props;

	  // also rejects an invalid device ordinal before anything is printed
	  if (!cudaCallOk(cudaGetDeviceProperties(&props,device),"cudaGetDeviceProperties"))
	    return 1;

	  printf("%s (%2d)\n",props.name,props.multiProcessorCount);
	  printf("%s scan ...\n",inclusive ? "inclusive" : "exclusive");

	  if (!cudaCallOk(cudaSetDevice(device),"cudaSetDevice"))
	    return 1;

	  //
	  // BUFFERS
	  //

	  const size_t bytes = sizeof(unsigned int) * WARP_SIZE;
	  unsigned int* vin = NULL;
	  unsigned int* vout = NULL;

	  unsigned int win[WARP_SIZE] =
	    {
	      1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
	      1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1
	    };

	  unsigned int wout[WARP_SIZE];

	  // && short-circuits: the first failure skips the remaining calls
	  bool ok =
	    cudaCallOk(cudaMalloc(&vin, bytes),"cudaMalloc(vin)") &&
	    cudaCallOk(cudaMalloc(&vout,bytes),"cudaMalloc(vout)") &&
	    cudaCallOk(cudaMemcpy(vin,win,bytes,cudaMemcpyHostToDevice),
	               "cudaMemcpy(host to device)");

	  //
	  // LAUNCH KERNEL
	  //

	  if (ok)
	    {
	      if (inclusive)
	        inclusivePlusScanKernel<<<1,WARP_SIZE>>>(vin,vout);
	      else
	        exclusivePlusScanKernel<<<1,WARP_SIZE>>>(vin,vout);

	      // launch errors are only visible through cudaGetLastError();
	      // faults inside the kernel surface at the synchronize
	      ok =
	        cudaCallOk(cudaGetLastError(),"kernel launch") &&
	        cudaCallOk(cudaDeviceSynchronize(),"cudaDeviceSynchronize") &&
	        cudaCallOk(cudaMemcpy(wout,vout,bytes,cudaMemcpyDeviceToHost),
	                   "cudaMemcpy(device to host)");
	    }

	  if (ok)
	    {
	      printScan("warp",win);
	      printScan("scan",wout);
	    }

	  //
	  // CLEAN UP -- cudaFree() of a null pointer is a no-op
	  //

	  cudaFree(vin);
	  cudaFree(vout);

	  cudaDeviceReset();

	  return ok ? 0 : 1;
	}