louchenyao · July 31, 2020 01:49
diff --git a/bench_seq.cu b/bench_seq.cu
 // nvcc bench_seq.cu -O3 -gencode arch=compute_70,code=sm_70
 // Result on V100:
 // bench_seq        max throughput: 769.856 GiB/s
 // bench_seq_unroll max throughput: 833.067 GiB/s

 #include <iostream>

 __device__
 int64_t reduce(int64_t *buf, uint64_t s) {
    // The more effecient way is to reduce within the wrap firstly, but it's not the bottleneck
    buf[threadIdx.x] = s;
    __syncthreads();
    for (int i = blockDim.x/2; i > 0; i /= 2) {
        if (threadIdx.x < i) {
            buf[threadIdx.x] += buf[threadIdx.x + i];
        }
        __syncthreads();
    }
    if (threadIdx.x == 0) {
        return buf[0];
    }
    return 0;
 }

 __global__
 void bench_seq(int *a, int n, int64_t *res) {
    int items_per_block = n / gridDim.x;
    unsigned long long s = 0;
    for (int offset = items_per_block * blockIdx.x; offset < items_per_block * (blockIdx.x+1); offset += blockDim.x) {
        s += a[offset + threadIdx.x];
    }

    __shared__ int64_t buf[512];
    unsigned long long aggregate = reduce(buf, s);
    if (threadIdx.x == 0) {
        atomicAdd((unsigned long long*)res, aggregate);
    }
 }

 __global__
 void bench_seq_unroll(int *a, int n, int64_t *res) {
    int items_per_block = n / gridDim.x;
    int64_t s = 0;
    int reg_a[4];
    for (int offset = items_per_block * blockIdx.x; offset < items_per_block * (blockIdx.x+1); offset += blockDim.x*4) {
        #pragma unroll
        for (int i = 0; i < 4; i++) {
            reg_a[i] = a[offset + i*blockDim.x + threadIdx.x];
        }
        #pragma unroll
        for (int i = 0; i < 4; i++) {
            s += reg_a[i];
        }
    }

    __shared__ int64_t buf[512];
    unsigned long long aggregate = reduce(buf, s);
    if (threadIdx.x == 0) {
        atomicAdd((unsigned long long*)res, aggregate);
    }
 }

 #define BENCH(f, m) { \
    float t; \
    cudaEvent_t start, stop; \
    cudaEventCreate(&start); \
    cudaEventCreate(&stop); \
    cudaMemset(d_res, 0, sizeof(int64_t)); \
    cudaEventRecord(start, 0); \
    f<<<blocks, 512>>>(d_a, n, d_res); \
    cudaEventRecord(stop, 0); \
    cudaEventSynchronize(stop); \
    cudaEventElapsedTime(&t, start,stop); \
    int64_t h_res; \
    cudaMemcpy(&h_res, d_res, sizeof(h_res), cudaMemcpyDeviceToHost); \
    if (h_res != ans) { \
        std::cout << "Wrong Result: " << h_res << std::endl; \
        std::cout << "Expected: " << ans << std::endl; \
        return 1; \
    } \
    double thr = double(4)*n/(1<<30)*1000/t; \
    if (thr > m) { \
        m = thr; \
    } \
    std::cout << "------------" << std::endl \
         << "| 📛: " << #f << std::endl\
         << "|Blocks: " << blocks \
         << "\tTime: " << t \
         << "\tThroughput: " << thr \
         << std::endl; \
 }


 int main(int argc, char** argv)
 {
  int n = 1024 * (1 << 20); // 4 GB

  // generating
  int *h_a = new int[n];
  int *d_a;
  int64_t *d_res;
  int64_t ans = 0;
  for (int i = 0; i < n; i++) {
      h_a[i] = i;
      ans += h_a[i];
  }
  cudaMalloc((void**)&d_a, sizeof(int) * n);
  cudaMalloc((void**)&d_res, sizeof(int64_t));
  cudaMemcpy(d_a, h_a, sizeof(int) * n, cudaMemcpyHostToDevice);


  // run
  double seq_max = 0, seq_unroll_max = 0;
  for (int blocks = 8; blocks <= 8192 ; blocks *= 2) {
    for (int i = 0; i < 2; i++) {
        BENCH(bench_seq, seq_max);
        BENCH(bench_seq_unroll, seq_unroll_max);
    }
  }
  std::cout << "------------" << std::endl
            << "| bench_seq        max throughput: " << seq_max << std::endl
            << "| bench_seq_unroll max throughput: " << seq_unroll_max << std::endl;

  cudaFree(d_a);
  cudaFree(d_res);
  delete[] h_a;

  return 0;
 }
diff --git a/bench_seq.ptx b/bench_seq.ptx
 // nvcc -ptx -o bench_seq.ptx bench_seq.cu -O3 -gencode arch=compute_70,code=sm_70

 // Generated by NVIDIA NVVM Compiler
 //
 // Compiler Build ID: CL-27506705
 // Cuda compilation tools, release 10.2, V10.2.89
 // Based on LLVM 3.4svn
 //

 .version 6.5
 .target sm_70
 .address_size 64

 	// .globl	_Z9bench_seqPiiPl
 // _ZZ9bench_seqPiiPlE3buf has been demoted
 // _ZZ16bench_seq_unrollPiiPlE3buf has been demoted

 .visible .entry _Z9bench_seqPiiPl(
 	.param .u64 _Z9bench_seqPiiPl_param_0,
 	.param .u32 _Z9bench_seqPiiPl_param_1,
 	.param .u64 _Z9bench_seqPiiPl_param_2
 )
 {
 	.reg .pred 	%p<8>;
 	.reg .b32 	%r<27>;
 	.reg .b64 	%rd<21>;
 	// demoted variable
 	.shared .align 8 .b8 _ZZ9bench_seqPiiPlE3buf[4096];

 	ld.param.u64 	%rd8, [_Z9bench_seqPiiPl_param_0];
 	ld.param.u32 	%r11, [_Z9bench_seqPiiPl_param_1];
 	ld.param.u64 	%rd5, [_Z9bench_seqPiiPl_param_2];
 	cvta.to.global.u64 	%rd1, %rd8;
 	mov.u32 	%r12, %nctaid.x;
 	div.u32 	%r13, %r11, %r12;
 	mov.u32 	%r14, %ctaid.x;
 	mul.lo.s32 	%r25, %r14, %r13;
 	add.s32 	%r15, %r14, 1;
 	mul.lo.s32 	%r2, %r15, %r13;
 	mov.u32 	%r3, %tid.x;
 	mov.u64 	%rd20, 0;
 	mov.u32 	%r4, %ntid.x;
 	setp.ge.u32	%p2, %r25, %r2;
 	@%p2 bra 	BB0_2;

 // BBO_1 is the for loop

 BB0_1:
 	add.s32 	%r16, %r3, %r25;
 	mul.wide.u32 	%rd9, %r16, 4;
 	add.s64 	%rd10, %rd1, %rd9;
 	ld.global.s32 	%rd11, [%rd10]; // load
 	add.s64 	%rd20, %rd11, %rd20;
 	add.s32 	%r25, %r4, %r25;
 	setp.lt.u32	%p3, %r25, %r2;
 	@%p3 bra 	BB0_1;

 // the following is inlined reduce part

 BB0_2:
 	shl.b32 	%r17, %r3, 3;
 	mov.u32 	%r18, _ZZ9bench_seqPiiPlE3buf;
 	add.s32 	%r7, %r18, %r17;
 	st.shared.u64 	[%r7], %rd20;
 	bar.sync 	0;
 	shr.u32 	%r26, %r4, 1;
 	setp.eq.s32	%p4, %r26, 0;
 	@%p4 bra 	BB0_6;

 BB0_3:
 	setp.ge.u32	%p5, %r3, %r26;
 	@%p5 bra 	BB0_5;

 	add.s32 	%r19, %r26, %r3;
 	shl.b32 	%r20, %r19, 3;
 	add.s32 	%r22, %r18, %r20;
 	ld.shared.u64 	%rd12, [%r7];
 	ld.shared.u64 	%rd13, [%r22];
 	add.s64 	%rd14, %rd12, %rd13;
 	st.shared.u64 	[%r7], %rd14;

 BB0_5:
 	bar.sync 	0;
 	shr.u32 	%r23, %r26, 31;
 	add.s32 	%r24, %r26, %r23;
 	shr.s32 	%r10, %r24, 1;
 	setp.gt.s32	%p6, %r26, 1;
 	mov.u32 	%r26, %r10;
 	@%p6 bra 	BB0_3;

 BB0_6:
 	setp.eq.s32	%p1, %r3, 0;
 	setp.ne.s32	%p7, %r3, 0;
 	@%p7 bra 	BB0_8;

 	ld.shared.u64 	%rd15, [_ZZ9bench_seqPiiPlE3buf];
 	cvta.to.global.u64 	%rd16, %rd5;
 	selp.b64	%rd17, %rd15, 0, %p1;
 	atom.global.add.u64 	%rd18, [%rd16], %rd17;

 BB0_8:
 	ret;
 }

 	// .globl	_Z16bench_seq_unrollPiiPl
 .visible .entry _Z16bench_seq_unrollPiiPl(
 	.param .u64 _Z16bench_seq_unrollPiiPl_param_0,
 	.param .u32 _Z16bench_seq_unrollPiiPl_param_1,
 	.param .u64 _Z16bench_seq_unrollPiiPl_param_2
 )
 {
 	.reg .pred 	%p<8>;
 	.reg .b32 	%r<35>;
 	.reg .b64 	%rd<33>;
 	// demoted variable
 	.shared .align 8 .b8 _ZZ16bench_seq_unrollPiiPlE3buf[4096];

 	ld.param.u64 	%rd5, [_Z16bench_seq_unrollPiiPl_param_0];
 	ld.param.u32 	%r15, [_Z16bench_seq_unrollPiiPl_param_1];
 	ld.param.u64 	%rd6, [_Z16bench_seq_unrollPiiPl_param_2];
 	mov.u32 	%r16, %nctaid.x;
 	div.u32 	%r17, %r15, %r16;
 	mov.u32 	%r18, %ctaid.x;
 	mul.lo.s32 	%r33, %r18, %r17;
 	add.s32 	%r19, %r18, 1;
 	mul.lo.s32 	%r2, %r19, %r17;
 	mov.u64 	%rd32, 0;
 	mov.u32 	%r3, %ntid.x;
 	setp.ge.u32	%p2, %r33, %r2;
 	@%p2 bra 	BB1_3;

 	cvta.to.global.u64 	%rd1, %rd5;
 	shl.b32 	%r4, %r3, 2;
 	shl.b32 	%r5, %r3, 1;
 	mul.lo.s32 	%r6, %r3, 3;
 	mov.u64 	%rd32, 0;
 	mov.u32 	%r7, %tid.x;

 BB1_2:
 	add.s32 	%r20, %r33, %r7;
 	mul.wide.u32 	%rd9, %r20, 4;
 	add.s64 	%rd10, %rd1, %rd9;
 	add.s32 	%r21, %r20, %r3;
 	mul.wide.u32 	%rd11, %r21, 4;
 	add.s64 	%rd12, %rd1, %rd11;
 	add.s32 	%r22, %r20, %r5;
 	mul.wide.u32 	%rd13, %r22, 4;
 	add.s64 	%rd14, %rd1, %rd13;
 	add.s32 	%r23, %r20, %r6;
 	mul.wide.u32 	%rd15, %r23, 4;
 	add.s64 	%rd16, %rd1, %rd15;
 	ld.global.s32 	%rd17, [%rd10]; // load 1
 	add.s64 	%rd18, %rd17, %rd32;
 	ld.global.s32 	%rd19, [%rd12]; // load 2
 	add.s64 	%rd20, %rd19, %rd18;
 	ld.global.s32 	%rd21, [%rd14]; // load 3
 	add.s64 	%rd22, %rd21, %rd20;
 	ld.global.s32 	%rd23, [%rd16]; // load 4
 	add.s64 	%rd32, %rd23, %rd22;
 	add.s32 	%r33, %r4, %r33;
 	setp.lt.u32	%p3, %r33, %r2;
 	@%p3 bra 	BB1_2;

 BB1_3:
 	mov.u32 	%r10, %tid.x;
 	shl.b32 	%r24, %r10, 3;
 	mov.u32 	%r25, _ZZ16bench_seq_unrollPiiPlE3buf;
 	add.s32 	%r11, %r25, %r24;
 	st.shared.u64 	[%r11], %rd32;
 	bar.sync 	0;
 	shr.u32 	%r34, %r3, 1;
 	setp.eq.s32	%p4, %r34, 0;
 	@%p4 bra 	BB1_7;

 BB1_4:
 	setp.ge.u32	%p5, %r10, %r34;
 	@%p5 bra 	BB1_6;

 	add.s32 	%r27, %r34, %r10;
 	shl.b32 	%r28, %r27, 3;
 	add.s32 	%r30, %r25, %r28;
 	ld.shared.u64 	%rd24, [%r11];
 	ld.shared.u64 	%rd25, [%r30];
 	add.s64 	%rd26, %rd24, %rd25;
 	st.shared.u64 	[%r11], %rd26;

 BB1_6:
 	bar.sync 	0;
 	shr.u32 	%r31, %r34, 31;
 	add.s32 	%r32, %r34, %r31;
 	shr.s32 	%r14, %r32, 1;
 	setp.gt.s32	%p6, %r34, 1;
 	mov.u32 	%r34, %r14;
 	@%p6 bra 	BB1_4;

 BB1_7:
 	setp.eq.s32	%p1, %r10, 0;
 	setp.ne.s32	%p7, %r10, 0;
 	@%p7 bra 	BB1_9;

 	ld.shared.u64 	%rd27, [_ZZ16bench_seq_unrollPiiPlE3buf];
 	cvta.to.global.u64 	%rd28, %rd6;
 	selp.b64	%rd29, %rd27, 0, %p1;
 	atom.global.add.u64 	%rd30, [%rd28], %rd29;

 BB1_9:
 	ret;
 }
	// nvcc bench_seq.cu -O3 -gencode arch=compute_70,code=sm_70
	// Result on V100:
	// bench_seq max throughput: 769.856 GiB/s
	// bench_seq_unroll max throughput: 833.067 GiB/s

	#include <iostream>

	__device__
	int64_t reduce(int64_t *buf, uint64_t s) {
	// The more effecient way is to reduce within the wrap firstly, but it's not the bottleneck
	buf[threadIdx.x] = s;
	__syncthreads();
	for (int i = blockDim.x/2; i > 0; i /= 2) {
	if (threadIdx.x < i) {
	buf[threadIdx.x] += buf[threadIdx.x + i];
	}
	__syncthreads();
	}
	if (threadIdx.x == 0) {
	return buf[0];
	}
	return 0;
	}

	__global__
	void bench_seq(int a, int n, int64_t res) {
	int items_per_block = n / gridDim.x;
	unsigned long long s = 0;
	for (int offset = items_per_block * blockIdx.x; offset < items_per_block * (blockIdx.x+1); offset += blockDim.x) {
	s += a[offset + threadIdx.x];
	}

	__shared__ int64_t buf[512];
	unsigned long long aggregate = reduce(buf, s);
	if (threadIdx.x == 0) {
	atomicAdd((unsigned long long*)res, aggregate);
	}
	}

	__global__
	void bench_seq_unroll(int a, int n, int64_t res) {
	int items_per_block = n / gridDim.x;
	int64_t s = 0;
	int reg_a[4];
	for (int offset = items_per_block * blockIdx.x; offset < items_per_block * (blockIdx.x+1); offset += blockDim.x*4) {
	#pragma unroll
	for (int i = 0; i < 4; i++) {
	reg_a[i] = a[offset + i*blockDim.x + threadIdx.x];
	}
	#pragma unroll
	for (int i = 0; i < 4; i++) {
	s += reg_a[i];
	}
	}

	__shared__ int64_t buf[512];
	unsigned long long aggregate = reduce(buf, s);
	if (threadIdx.x == 0) {
	atomicAdd((unsigned long long*)res, aggregate);
	}
	}

	#define BENCH(f, m) { \
	float t; \
	cudaEvent_t start, stop; \
	cudaEventCreate(&start); \
	cudaEventCreate(&stop); \
	cudaMemset(d_res, 0, sizeof(int64_t)); \
	cudaEventRecord(start, 0); \
	f<<<blocks, 512>>>(d_a, n, d_res); \
	cudaEventRecord(stop, 0); \
	cudaEventSynchronize(stop); \
	cudaEventElapsedTime(&t, start,stop); \
	int64_t h_res; \
	cudaMemcpy(&h_res, d_res, sizeof(h_res), cudaMemcpyDeviceToHost); \
	if (h_res != ans) { \
	std::cout << "Wrong Result: " << h_res << std::endl; \
	std::cout << "Expected: " << ans << std::endl; \
	return 1; \
	} \
	double thr = double(4)n/(1<<30)1000/t; \
	if (thr > m) { \
	m = thr; \
	} \
	std::cout << "------------" << std::endl \
	<< "\| 📛: " << #f << std::endl\
	<< "\|Blocks: " << blocks \
	<< "\tTime: " << t \
	<< "\tThroughput: " << thr \
	<< std::endl; \
	}


	int main(int argc, char** argv)
	{
	int n = 1024 * (1 << 20); // 4 GB

	// generating
	int *h_a = new int[n];
	int *d_a;
	int64_t *d_res;
	int64_t ans = 0;
	for (int i = 0; i < n; i++) {
	h_a[i] = i;
	ans += h_a[i];
	}
	cudaMalloc((void*)&d_a, sizeof(int) n);
	cudaMalloc((void**)&d_res, sizeof(int64_t));
	cudaMemcpy(d_a, h_a, sizeof(int) * n, cudaMemcpyHostToDevice);


	// run
	double seq_max = 0, seq_unroll_max = 0;
	for (int blocks = 8; blocks <= 8192 ; blocks *= 2) {
	for (int i = 0; i < 2; i++) {
	BENCH(bench_seq, seq_max);
	BENCH(bench_seq_unroll, seq_unroll_max);
	}
	}
	std::cout << "------------" << std::endl
	<< "\| bench_seq max throughput: " << seq_max << std::endl
	<< "\| bench_seq_unroll max throughput: " << seq_unroll_max << std::endl;

	cudaFree(d_a);
	cudaFree(d_res);
	delete[] h_a;

	return 0;
	}
	// nvcc -ptx -o bench_seq.ptx bench_seq.cu -O3 -gencode arch=compute_70,code=sm_70

	// Generated by NVIDIA NVVM Compiler
	//
	// Compiler Build ID: CL-27506705
	// Cuda compilation tools, release 10.2, V10.2.89
	// Based on LLVM 3.4svn
	//

	.version 6.5
	.target sm_70
	.address_size 64

	// .globl _Z9bench_seqPiiPl
	// _ZZ9bench_seqPiiPlE3buf has been demoted
	// _ZZ16bench_seq_unrollPiiPlE3buf has been demoted

	.visible .entry _Z9bench_seqPiiPl(
	.param .u64 _Z9bench_seqPiiPl_param_0,
	.param .u32 _Z9bench_seqPiiPl_param_1,
	.param .u64 _Z9bench_seqPiiPl_param_2
	)
	{
	.reg .pred %p<8>;
	.reg .b32 %r<27>;
	.reg .b64 %rd<21>;
	// demoted variable
	.shared .align 8 .b8 _ZZ9bench_seqPiiPlE3buf[4096];

	ld.param.u64 %rd8, [_Z9bench_seqPiiPl_param_0];
	ld.param.u32 %r11, [_Z9bench_seqPiiPl_param_1];
	ld.param.u64 %rd5, [_Z9bench_seqPiiPl_param_2];
	cvta.to.global.u64 %rd1, %rd8;
	mov.u32 %r12, %nctaid.x;
	div.u32 %r13, %r11, %r12;
	mov.u32 %r14, %ctaid.x;
	mul.lo.s32 %r25, %r14, %r13;
	add.s32 %r15, %r14, 1;
	mul.lo.s32 %r2, %r15, %r13;
	mov.u32 %r3, %tid.x;
	mov.u64 %rd20, 0;
	mov.u32 %r4, %ntid.x;
	setp.ge.u32 %p2, %r25, %r2;
	@%p2 bra BB0_2;

	// BBO_1 is the for loop

	BB0_1:
	add.s32 %r16, %r3, %r25;
	mul.wide.u32 %rd9, %r16, 4;
	add.s64 %rd10, %rd1, %rd9;
	ld.global.s32 %rd11, [%rd10]; // load
	add.s64 %rd20, %rd11, %rd20;
	add.s32 %r25, %r4, %r25;
	setp.lt.u32 %p3, %r25, %r2;
	@%p3 bra BB0_1;

	// the following is inlined reduce part

	BB0_2:
	shl.b32 %r17, %r3, 3;
	mov.u32 %r18, _ZZ9bench_seqPiiPlE3buf;
	add.s32 %r7, %r18, %r17;
	st.shared.u64 [%r7], %rd20;
	bar.sync 0;
	shr.u32 %r26, %r4, 1;
	setp.eq.s32 %p4, %r26, 0;
	@%p4 bra BB0_6;

	BB0_3:
	setp.ge.u32 %p5, %r3, %r26;
	@%p5 bra BB0_5;

	add.s32 %r19, %r26, %r3;
	shl.b32 %r20, %r19, 3;
	add.s32 %r22, %r18, %r20;
	ld.shared.u64 %rd12, [%r7];
	ld.shared.u64 %rd13, [%r22];
	add.s64 %rd14, %rd12, %rd13;
	st.shared.u64 [%r7], %rd14;

	BB0_5:
	bar.sync 0;
	shr.u32 %r23, %r26, 31;
	add.s32 %r24, %r26, %r23;
	shr.s32 %r10, %r24, 1;
	setp.gt.s32 %p6, %r26, 1;
	mov.u32 %r26, %r10;
	@%p6 bra BB0_3;

	BB0_6:
	setp.eq.s32 %p1, %r3, 0;
	setp.ne.s32 %p7, %r3, 0;
	@%p7 bra BB0_8;

	ld.shared.u64 %rd15, [_ZZ9bench_seqPiiPlE3buf];
	cvta.to.global.u64 %rd16, %rd5;
	selp.b64 %rd17, %rd15, 0, %p1;
	atom.global.add.u64 %rd18, [%rd16], %rd17;

	BB0_8:
	ret;
	}

	// .globl _Z16bench_seq_unrollPiiPl
	.visible .entry _Z16bench_seq_unrollPiiPl(
	.param .u64 _Z16bench_seq_unrollPiiPl_param_0,
	.param .u32 _Z16bench_seq_unrollPiiPl_param_1,
	.param .u64 _Z16bench_seq_unrollPiiPl_param_2
	)
	{
	.reg .pred %p<8>;
	.reg .b32 %r<35>;
	.reg .b64 %rd<33>;
	// demoted variable
	.shared .align 8 .b8 _ZZ16bench_seq_unrollPiiPlE3buf[4096];

	ld.param.u64 %rd5, [_Z16bench_seq_unrollPiiPl_param_0];
	ld.param.u32 %r15, [_Z16bench_seq_unrollPiiPl_param_1];
	ld.param.u64 %rd6, [_Z16bench_seq_unrollPiiPl_param_2];
	mov.u32 %r16, %nctaid.x;
	div.u32 %r17, %r15, %r16;
	mov.u32 %r18, %ctaid.x;
	mul.lo.s32 %r33, %r18, %r17;
	add.s32 %r19, %r18, 1;
	mul.lo.s32 %r2, %r19, %r17;
	mov.u64 %rd32, 0;
	mov.u32 %r3, %ntid.x;
	setp.ge.u32 %p2, %r33, %r2;
	@%p2 bra BB1_3;

	cvta.to.global.u64 %rd1, %rd5;
	shl.b32 %r4, %r3, 2;
	shl.b32 %r5, %r3, 1;
	mul.lo.s32 %r6, %r3, 3;
	mov.u64 %rd32, 0;
	mov.u32 %r7, %tid.x;

	BB1_2:
	add.s32 %r20, %r33, %r7;
	mul.wide.u32 %rd9, %r20, 4;
	add.s64 %rd10, %rd1, %rd9;
	add.s32 %r21, %r20, %r3;
	mul.wide.u32 %rd11, %r21, 4;
	add.s64 %rd12, %rd1, %rd11;
	add.s32 %r22, %r20, %r5;
	mul.wide.u32 %rd13, %r22, 4;
	add.s64 %rd14, %rd1, %rd13;
	add.s32 %r23, %r20, %r6;
	mul.wide.u32 %rd15, %r23, 4;
	add.s64 %rd16, %rd1, %rd15;
	ld.global.s32 %rd17, [%rd10]; // load 1
	add.s64 %rd18, %rd17, %rd32;
	ld.global.s32 %rd19, [%rd12]; // load 2
	add.s64 %rd20, %rd19, %rd18;
	ld.global.s32 %rd21, [%rd14]; // load 3
	add.s64 %rd22, %rd21, %rd20;
	ld.global.s32 %rd23, [%rd16]; // load 4
	add.s64 %rd32, %rd23, %rd22;
	add.s32 %r33, %r4, %r33;
	setp.lt.u32 %p3, %r33, %r2;
	@%p3 bra BB1_2;

	BB1_3:
	mov.u32 %r10, %tid.x;
	shl.b32 %r24, %r10, 3;
	mov.u32 %r25, _ZZ16bench_seq_unrollPiiPlE3buf;
	add.s32 %r11, %r25, %r24;
	st.shared.u64 [%r11], %rd32;
	bar.sync 0;
	shr.u32 %r34, %r3, 1;
	setp.eq.s32 %p4, %r34, 0;
	@%p4 bra BB1_7;

	BB1_4:
	setp.ge.u32 %p5, %r10, %r34;
	@%p5 bra BB1_6;

	add.s32 %r27, %r34, %r10;
	shl.b32 %r28, %r27, 3;
	add.s32 %r30, %r25, %r28;
	ld.shared.u64 %rd24, [%r11];
	ld.shared.u64 %rd25, [%r30];
	add.s64 %rd26, %rd24, %rd25;
	st.shared.u64 [%r11], %rd26;

	BB1_6:
	bar.sync 0;
	shr.u32 %r31, %r34, 31;
	add.s32 %r32, %r34, %r31;
	shr.s32 %r14, %r32, 1;
	setp.gt.s32 %p6, %r34, 1;
	mov.u32 %r34, %r14;
	@%p6 bra BB1_4;

	BB1_7:
	setp.eq.s32 %p1, %r10, 0;
	setp.ne.s32 %p7, %r10, 0;
	@%p7 bra BB1_9;

	ld.shared.u64 %rd27, [_ZZ16bench_seq_unrollPiiPlE3buf];
	cvta.to.global.u64 %rd28, %rd6;
	selp.b64 %rd29, %rd27, 0, %p1;
	atom.global.add.u64 %rd30, [%rd28], %rd29;

	BB1_9:
	ret;
	}