Last active
July 31, 2020 01:49
-
-
Save louchenyao/cde014599eba100abf67981ba0ade7e3 to your computer and use it in GitHub Desktop.
Benchmark of GPU sequential read performance
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// nvcc bench_seq.cu -O3 -gencode arch=compute_70,code=sm_70 | |
// Result on V100: | |
// bench_seq max throughput: 769.856 GiB/s | |
// bench_seq_unroll max throughput: 833.067 GiB/s | |
#include <iostream> | |
__device__ | |
int64_t reduce(int64_t *buf, uint64_t s) { | |
// The more effecient way is to reduce within the wrap firstly, but it's not the bottleneck | |
buf[threadIdx.x] = s; | |
__syncthreads(); | |
for (int i = blockDim.x/2; i > 0; i /= 2) { | |
if (threadIdx.x < i) { | |
buf[threadIdx.x] += buf[threadIdx.x + i]; | |
} | |
__syncthreads(); | |
} | |
if (threadIdx.x == 0) { | |
return buf[0]; | |
} | |
return 0; | |
} | |
__global__ | |
void bench_seq(int *a, int n, int64_t *res) { | |
int items_per_block = n / gridDim.x; | |
unsigned long long s = 0; | |
for (int offset = items_per_block * blockIdx.x; offset < items_per_block * (blockIdx.x+1); offset += blockDim.x) { | |
s += a[offset + threadIdx.x]; | |
} | |
__shared__ int64_t buf[512]; | |
unsigned long long aggregate = reduce(buf, s); | |
if (threadIdx.x == 0) { | |
atomicAdd((unsigned long long*)res, aggregate); | |
} | |
} | |
__global__ | |
void bench_seq_unroll(int *a, int n, int64_t *res) { | |
int items_per_block = n / gridDim.x; | |
int64_t s = 0; | |
int reg_a[4]; | |
for (int offset = items_per_block * blockIdx.x; offset < items_per_block * (blockIdx.x+1); offset += blockDim.x*4) { | |
#pragma unroll | |
for (int i = 0; i < 4; i++) { | |
reg_a[i] = a[offset + i*blockDim.x + threadIdx.x]; | |
} | |
#pragma unroll | |
for (int i = 0; i < 4; i++) { | |
s += reg_a[i]; | |
} | |
} | |
__shared__ int64_t buf[512]; | |
unsigned long long aggregate = reduce(buf, s); | |
if (threadIdx.x == 0) { | |
atomicAdd((unsigned long long*)res, aggregate); | |
} | |
} | |
#define BENCH(f, m) { \ | |
float t; \ | |
cudaEvent_t start, stop; \ | |
cudaEventCreate(&start); \ | |
cudaEventCreate(&stop); \ | |
cudaMemset(d_res, 0, sizeof(int64_t)); \ | |
cudaEventRecord(start, 0); \ | |
f<<<blocks, 512>>>(d_a, n, d_res); \ | |
cudaEventRecord(stop, 0); \ | |
cudaEventSynchronize(stop); \ | |
cudaEventElapsedTime(&t, start,stop); \ | |
int64_t h_res; \ | |
cudaMemcpy(&h_res, d_res, sizeof(h_res), cudaMemcpyDeviceToHost); \ | |
if (h_res != ans) { \ | |
std::cout << "Wrong Result: " << h_res << std::endl; \ | |
std::cout << "Expected: " << ans << std::endl; \ | |
return 1; \ | |
} \ | |
double thr = double(4)*n/(1<<30)*1000/t; \ | |
if (thr > m) { \ | |
m = thr; \ | |
} \ | |
std::cout << "------------" << std::endl \ | |
<< "| 📛: " << #f << std::endl\ | |
<< "|Blocks: " << blocks \ | |
<< "\tTime: " << t \ | |
<< "\tThroughput: " << thr \ | |
<< std::endl; \ | |
} | |
int main(int argc, char** argv) | |
{ | |
int n = 1024 * (1 << 20); // 4 GB | |
// generating | |
int *h_a = new int[n]; | |
int *d_a; | |
int64_t *d_res; | |
int64_t ans = 0; | |
for (int i = 0; i < n; i++) { | |
h_a[i] = i; | |
ans += h_a[i]; | |
} | |
cudaMalloc((void**)&d_a, sizeof(int) * n); | |
cudaMalloc((void**)&d_res, sizeof(int64_t)); | |
cudaMemcpy(d_a, h_a, sizeof(int) * n, cudaMemcpyHostToDevice); | |
// run | |
double seq_max = 0, seq_unroll_max = 0; | |
for (int blocks = 8; blocks <= 8192 ; blocks *= 2) { | |
for (int i = 0; i < 2; i++) { | |
BENCH(bench_seq, seq_max); | |
BENCH(bench_seq_unroll, seq_unroll_max); | |
} | |
} | |
std::cout << "------------" << std::endl | |
<< "| bench_seq max throughput: " << seq_max << std::endl | |
<< "| bench_seq_unroll max throughput: " << seq_unroll_max << std::endl; | |
cudaFree(d_a); | |
cudaFree(d_res); | |
delete[] h_a; | |
return 0; | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// nvcc -ptx -o bench_seq.ptx bench_seq.cu -O3 -gencode arch=compute_70,code=sm_70 | |
// Generated by NVIDIA NVVM Compiler | |
// | |
// Compiler Build ID: CL-27506705 | |
// Cuda compilation tools, release 10.2, V10.2.89 | |
// Based on LLVM 3.4svn | |
// | |
.version 6.5 | |
.target sm_70 | |
.address_size 64 | |
// .globl _Z9bench_seqPiiPl | |
// _ZZ9bench_seqPiiPlE3buf has been demoted | |
// _ZZ16bench_seq_unrollPiiPlE3buf has been demoted | |
.visible .entry _Z9bench_seqPiiPl( | |
.param .u64 _Z9bench_seqPiiPl_param_0, | |
.param .u32 _Z9bench_seqPiiPl_param_1, | |
.param .u64 _Z9bench_seqPiiPl_param_2 | |
) | |
{ | |
.reg .pred %p<8>; | |
.reg .b32 %r<27>; | |
.reg .b64 %rd<21>; | |
// demoted variable | |
.shared .align 8 .b8 _ZZ9bench_seqPiiPlE3buf[4096]; | |
ld.param.u64 %rd8, [_Z9bench_seqPiiPl_param_0]; | |
ld.param.u32 %r11, [_Z9bench_seqPiiPl_param_1]; | |
ld.param.u64 %rd5, [_Z9bench_seqPiiPl_param_2]; | |
cvta.to.global.u64 %rd1, %rd8; | |
mov.u32 %r12, %nctaid.x; | |
div.u32 %r13, %r11, %r12; | |
mov.u32 %r14, %ctaid.x; | |
mul.lo.s32 %r25, %r14, %r13; | |
add.s32 %r15, %r14, 1; | |
mul.lo.s32 %r2, %r15, %r13; | |
mov.u32 %r3, %tid.x; | |
mov.u64 %rd20, 0; | |
mov.u32 %r4, %ntid.x; | |
setp.ge.u32 %p2, %r25, %r2; | |
@%p2 bra BB0_2; | |
// BBO_1 is the for loop | |
BB0_1: | |
add.s32 %r16, %r3, %r25; | |
mul.wide.u32 %rd9, %r16, 4; | |
add.s64 %rd10, %rd1, %rd9; | |
ld.global.s32 %rd11, [%rd10]; // load | |
add.s64 %rd20, %rd11, %rd20; | |
add.s32 %r25, %r4, %r25; | |
setp.lt.u32 %p3, %r25, %r2; | |
@%p3 bra BB0_1; | |
// the following is inlined reduce part | |
BB0_2: | |
shl.b32 %r17, %r3, 3; | |
mov.u32 %r18, _ZZ9bench_seqPiiPlE3buf; | |
add.s32 %r7, %r18, %r17; | |
st.shared.u64 [%r7], %rd20; | |
bar.sync 0; | |
shr.u32 %r26, %r4, 1; | |
setp.eq.s32 %p4, %r26, 0; | |
@%p4 bra BB0_6; | |
BB0_3: | |
setp.ge.u32 %p5, %r3, %r26; | |
@%p5 bra BB0_5; | |
add.s32 %r19, %r26, %r3; | |
shl.b32 %r20, %r19, 3; | |
add.s32 %r22, %r18, %r20; | |
ld.shared.u64 %rd12, [%r7]; | |
ld.shared.u64 %rd13, [%r22]; | |
add.s64 %rd14, %rd12, %rd13; | |
st.shared.u64 [%r7], %rd14; | |
BB0_5: | |
bar.sync 0; | |
shr.u32 %r23, %r26, 31; | |
add.s32 %r24, %r26, %r23; | |
shr.s32 %r10, %r24, 1; | |
setp.gt.s32 %p6, %r26, 1; | |
mov.u32 %r26, %r10; | |
@%p6 bra BB0_3; | |
BB0_6: | |
setp.eq.s32 %p1, %r3, 0; | |
setp.ne.s32 %p7, %r3, 0; | |
@%p7 bra BB0_8; | |
ld.shared.u64 %rd15, [_ZZ9bench_seqPiiPlE3buf]; | |
cvta.to.global.u64 %rd16, %rd5; | |
selp.b64 %rd17, %rd15, 0, %p1; | |
atom.global.add.u64 %rd18, [%rd16], %rd17; | |
BB0_8: | |
ret; | |
} | |
// .globl _Z16bench_seq_unrollPiiPl | |
.visible .entry _Z16bench_seq_unrollPiiPl( | |
.param .u64 _Z16bench_seq_unrollPiiPl_param_0, | |
.param .u32 _Z16bench_seq_unrollPiiPl_param_1, | |
.param .u64 _Z16bench_seq_unrollPiiPl_param_2 | |
) | |
{ | |
.reg .pred %p<8>; | |
.reg .b32 %r<35>; | |
.reg .b64 %rd<33>; | |
// demoted variable | |
.shared .align 8 .b8 _ZZ16bench_seq_unrollPiiPlE3buf[4096]; | |
ld.param.u64 %rd5, [_Z16bench_seq_unrollPiiPl_param_0]; | |
ld.param.u32 %r15, [_Z16bench_seq_unrollPiiPl_param_1]; | |
ld.param.u64 %rd6, [_Z16bench_seq_unrollPiiPl_param_2]; | |
mov.u32 %r16, %nctaid.x; | |
div.u32 %r17, %r15, %r16; | |
mov.u32 %r18, %ctaid.x; | |
mul.lo.s32 %r33, %r18, %r17; | |
add.s32 %r19, %r18, 1; | |
mul.lo.s32 %r2, %r19, %r17; | |
mov.u64 %rd32, 0; | |
mov.u32 %r3, %ntid.x; | |
setp.ge.u32 %p2, %r33, %r2; | |
@%p2 bra BB1_3; | |
cvta.to.global.u64 %rd1, %rd5; | |
shl.b32 %r4, %r3, 2; | |
shl.b32 %r5, %r3, 1; | |
mul.lo.s32 %r6, %r3, 3; | |
mov.u64 %rd32, 0; | |
mov.u32 %r7, %tid.x; | |
BB1_2: | |
add.s32 %r20, %r33, %r7; | |
mul.wide.u32 %rd9, %r20, 4; | |
add.s64 %rd10, %rd1, %rd9; | |
add.s32 %r21, %r20, %r3; | |
mul.wide.u32 %rd11, %r21, 4; | |
add.s64 %rd12, %rd1, %rd11; | |
add.s32 %r22, %r20, %r5; | |
mul.wide.u32 %rd13, %r22, 4; | |
add.s64 %rd14, %rd1, %rd13; | |
add.s32 %r23, %r20, %r6; | |
mul.wide.u32 %rd15, %r23, 4; | |
add.s64 %rd16, %rd1, %rd15; | |
ld.global.s32 %rd17, [%rd10]; // load 1 | |
add.s64 %rd18, %rd17, %rd32; | |
ld.global.s32 %rd19, [%rd12]; // load 2 | |
add.s64 %rd20, %rd19, %rd18; | |
ld.global.s32 %rd21, [%rd14]; // load 3 | |
add.s64 %rd22, %rd21, %rd20; | |
ld.global.s32 %rd23, [%rd16]; // load 4 | |
add.s64 %rd32, %rd23, %rd22; | |
add.s32 %r33, %r4, %r33; | |
setp.lt.u32 %p3, %r33, %r2; | |
@%p3 bra BB1_2; | |
BB1_3: | |
mov.u32 %r10, %tid.x; | |
shl.b32 %r24, %r10, 3; | |
mov.u32 %r25, _ZZ16bench_seq_unrollPiiPlE3buf; | |
add.s32 %r11, %r25, %r24; | |
st.shared.u64 [%r11], %rd32; | |
bar.sync 0; | |
shr.u32 %r34, %r3, 1; | |
setp.eq.s32 %p4, %r34, 0; | |
@%p4 bra BB1_7; | |
BB1_4: | |
setp.ge.u32 %p5, %r10, %r34; | |
@%p5 bra BB1_6; | |
add.s32 %r27, %r34, %r10; | |
shl.b32 %r28, %r27, 3; | |
add.s32 %r30, %r25, %r28; | |
ld.shared.u64 %rd24, [%r11]; | |
ld.shared.u64 %rd25, [%r30]; | |
add.s64 %rd26, %rd24, %rd25; | |
st.shared.u64 [%r11], %rd26; | |
BB1_6: | |
bar.sync 0; | |
shr.u32 %r31, %r34, 31; | |
add.s32 %r32, %r34, %r31; | |
shr.s32 %r14, %r32, 1; | |
setp.gt.s32 %p6, %r34, 1; | |
mov.u32 %r34, %r14; | |
@%p6 bra BB1_4; | |
BB1_7: | |
setp.eq.s32 %p1, %r10, 0; | |
setp.ne.s32 %p7, %r10, 0; | |
@%p7 bra BB1_9; | |
ld.shared.u64 %rd27, [_ZZ16bench_seq_unrollPiiPlE3buf]; | |
cvta.to.global.u64 %rd28, %rd6; | |
selp.b64 %rd29, %rd27, 0, %p1; | |
atom.global.add.u64 %rd30, [%rd28], %rd29; | |
BB1_9: | |
ret; | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment