/*
   Compare single thread pure integer performance of CPUs and [AMD] GPUs

   hipcc hip-magic.hip
   cpplint --filter=-legal/copyright hip-magic.hip.cpp
   cppcheck --enable=all --suppress=missingIncludeSystem hip-magic.hip.cpp --check-config
*/
#include <sys/time.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <hip/hip_runtime.h>

#define HIP_CHECK(condition) assert(condition == hipSuccess);

// B is a 128-bit primality table: bit i (MSB-first per 32-bit word) is set iff i is prime
#define Prime(i) ((B[(i)>>5] & (0x80000000UL >> ((i)%32))) != 0)

#define forall_odd_primes_less_than(p, m, block) \
  for ((p) = 3; (p) < (m); (p) += 2) \
    if (Prime((p))) \
      block

// Search for a 3x3 magic square of distinct primes with center p and magic sum 3*p;
// the rows printed are (a, b, c), (2p-d, p, d), (2p-c, 2p-b, 2p-a)
__global__ void gpuHelloWorld(int N, int L) {
  int gid = blockIdx.x * blockDim.x + threadIdx.x;
  if (gid < N) {
    uint32_t B[] = {0x35145105, 0x4510414, 0x11411040, 0x45144001};
    uint8_t p, a, b, c, d;
    for (int i = 1; i <= L; ++i) {
      forall_odd_primes_less_than(p, 64,
        forall_odd_primes_less_than(a, p,
          if (Prime(2*p-a)) {
            forall_odd_primes_less_than(b, p,
              if ((b != a) && Prime(2*p-b)) {
                c = 3*p - (a+b);
                if ((c < 2*p) && (2*p-c != a) && (2*p-c != b) && Prime(c) && Prime(2*p-c)) {
                  if (2*a+b > 2*p) {
                    d = 2*a + b - 2*p;  // 3*p - (3*p-(a+b)) - (2*p-a)
                    if ((d != a) && (d != b) && (d != 2*p-c) && Prime(d) && Prime(2*p-d)) {
                      if (i == L)
                        printf("%3u|%3u|%3u|\n%3u|%3u|%3u|\n%3u|%3u|%3u|\n",
                               a, b, c, 2*p-d, p, d, 2*p-c, 2*p-b, 2*p-a);
                      goto fin;  // first solution found; repeat the whole search L times
                    }
                  }
                }
              }
            )
          }
        )
      )
      fin: ;  // a label must be followed by a statement
    }
  }
}

int main(int argc, char *argv[]) {
  int gridSize = 1;   // #blocks
  int blockSize = 1;  // threads per block
  int N = gridSize * blockSize;  // total #threads
  int L = argc == 1 ? 10000 : atoi(argv[1]);
  // time the kernel with HIP events
  hipEvent_t start, stop;
  HIP_CHECK(hipEventCreate(&start));
  HIP_CHECK(hipEventCreate(&stop));
  float elapsed_ms{};
  HIP_CHECK(hipEventRecord(start, hipStreamDefault));
  gpuHelloWorld<<<gridSize, blockSize>>>(N, L);  // launch kernel
  HIP_CHECK(hipEventRecord(stop, hipStreamDefault));
  HIP_CHECK(hipEventSynchronize(stop));
  HIP_CHECK(hipEventElapsedTime(&elapsed_ms, start, stop));
  printf("%.1f ms\n", elapsed_ms);
  HIP_CHECK(hipEventDestroy(stop));
  HIP_CHECK(hipEventDestroy(start));
  // time the identical search on the CPU with gettimeofday
  struct timeval tv0, tv1;
  gettimeofday(&tv1, NULL);  // wait for usec change to start on a tick boundary
  do gettimeofday(&tv0, NULL); while (tv0.tv_usec == tv1.tv_usec);
  uint32_t B[] = {0x35145105, 0x4510414, 0x11411040, 0x45144001};
  uint8_t p, a, b, c, d;
  for (int i = 1; i <= L; ++i) {
    forall_odd_primes_less_than(p, 64,
      forall_odd_primes_less_than(a, p,
        if (Prime(2*p-a)) {
          forall_odd_primes_less_than(b, p,
            if ((b != a) && Prime(2*p-b)) {
              c = 3*p - (a+b);
              if ((c < 2*p) && (2*p-c != a) && (2*p-c != b) && Prime(c) && Prime(2*p-c)) {
                if (2*a+b > 2*p) {
                  d = 2*a + b - 2*p;  // 3*p - (3*p-(a+b)) - (2*p-a)
                  if ((d != a) && (d != b) && (d != 2*p-c) && Prime(d) && Prime(2*p-d)) {
                    if (i == L)
                      printf("%3u|%3u|%3u|\n%3u|%3u|%3u|\n%3u|%3u|%3u|\n",
                             a, b, c, 2*p-d, p, d, 2*p-c, 2*p-b, 2*p-a);
                    goto fin;
                  }
                }
              }
            }
          )
        }
      )
    )
    fin: ;  // a label must be followed by a statement
  }
  gettimeofday(&tv1, NULL);
  printf("%ldus\n",
         1000000*(tv1.tv_sec-tv0.tv_sec)+tv1.tv_usec-tv0.tv_usec);
  return 0;
}
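The bitmask B[] in the listing encodes primality of 0..127: Prime(i) tests bit i%32, counted from the MSB, of word B[i>>5]. A minimal host-side sketch (my addition, not part of the gist) that regenerates the four words by trial division, so the table can be verified:

#include <stdint.h>
#include <stdio.h>

int main() {
  uint32_t B[4] = {0, 0, 0, 0};
  // mark each prime i in 0..127 with one bit, MSB-first per 32-bit word,
  // matching the layout the Prime(i) macro reads
  for (int i = 2; i < 128; ++i) {
    int prime = 1;
    for (int j = 2; j * j <= i; ++j)
      if (i % j == 0) { prime = 0; break; }
    if (prime)
      B[i >> 5] |= 0x80000000UL >> (i % 32);
  }
  printf("{0x%x, 0x%x, 0x%x, 0x%x}\n", B[0], B[1], B[2], B[3]);
  // prints {0x35145105, 0x4510414, 0x11411040, 0x45144001}
  return 0;
}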
Thread: the smallest unit of execution. Each thread runs an instance of the kernel (the GPU function) and performs calculations on a subset of the data.
Block: a group of threads that can cooperate by sharing data in shared memory. Each block operates independently of the others.
That means each block (CU) can run different work at the same time.
So it is fine to split the many magic square computations across the 60 CUs of the Radeon VII GPU.
Here only the first thread of each CU is used (a small illustration follows).
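For illustration, a tiny standalone kernel (hypothetical, not from the gist) that reports its block and thread indices and keeps only the first thread of each block, the same filter the diff below applies:

#include <hip/hip_runtime.h>
#include <stdio.h>

__global__ void whoAmI() {
  int gid = blockIdx.x * blockDim.x + threadIdx.x;
  if (threadIdx.x == 0)  // keep only the first thread of each block
    printf("block %u, thread %u, gid %d\n", blockIdx.x, threadIdx.x, gid);
}

int main() {
  whoAmI<<<3, 4>>>();      // 3 blocks of 4 threads -> one line per block
  hipDeviceSynchronize();  // wait for device printf output
  return 0;
}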
The code diff below does that: it executes L loops on each of the 60 CUs on the GPU, or L*60 loops on the CPU.
Now the GPU is 7/6× faster than the i3-6100 CPU (20195735 µs ÷ 17465.1 ms ≈ 7/6; runtimes below are for 6 million magic square computations on GPU and CPU):
hermann@Radeon-vii:~/rocm-examples$ time ./a.out 100000
47| 29|101|
113| 59| 5|
17| 89| 71|
17465.1 ms
47| 29|101|
113| 59| 5|
17| 89| 71|
20195735us
real 0m40.245s
user 0m38.427s
sys 0m0.672s
hermann@Radeon-vii:~/rocm-examples$
Yes, the i3-6100 CPU has 4 cores.
But the same code as in the current example allows running 64 threads per CU/block.
That would make the GPU 64/4 × 7/6 = 18.67× faster than the i3-6100 CPU (a small arithmetic check follows after the diff).
Diff:
hermann@Radeon-vii:~/rocm-examples$ diff hip-magic-f.hip hip-magic-f60.hip
20c20
< if (gid < N) {
---
> if (gid < N && gid % blockDim.x == 0) { // first thread of all blocks
37c37,38
< if (i == L)
---
> assert(a == 47 && b == 29);
> if (i == L && gid == 0) {
40c41
<
---
> }
56,57c57,58
< int gridSize = 1; // #blocks
< int blockSize = 1; // Threads per block
---
> int gridSize = 60; // #blocks
> int blockSize = 64; // Threads per block
88c89
< for(int i=1; i <= L; ++i) {
---
> for(int i=1; i <= L*gridSize; ++i) {
101c102,103
< if (i == L)
---
> assert(a == 47 && b == 29);
> if (i == L*gridSize)
hermann@Radeon-vii:~/rocm-examples$
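As a check of the 18.67× claim above, a trivial host-only sketch (my addition, not from the gist):

#include <stdio.h>

int main() {
  // 64 threads per CU instead of 1, 4 CPU cores instead of 1 thread,
  // times the measured single-thread ratio of 7/6
  double factor = 64.0 / 4.0 * 7.0 / 6.0;
  printf("expected GPU/CPU speedup: %.2f\n", factor);  // prints 18.67
  return 0;
}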
New HIP code
https://stamm-wilbrandt.de/forum/hip-magic-f21.hip
is the previous one, but utilizing only 21 CUs instead of the 60 CUs the Radeon VII GPU has (a runtime query for the CU count follows after the diff):
hermann@Radeon-vii:~/rocm-examples$ diff hip-magic-f60.hip hip-magic-f21.hip
57c57
< int gridSize = 60; // #blocks
---
> int gridSize = 21; // #blocks
hermann@Radeon-vii:~/rocm-examples$
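Rather than hardcoding 60 or 21 blocks, the CU count can be queried at runtime; a minimal sketch (my addition, not from the gist) using hipGetDeviceProperties:

#include <hip/hip_runtime.h>
#include <stdio.h>

int main() {
  hipDeviceProp_t prop;
  if (hipGetDeviceProperties(&prop, 0) != hipSuccess) return 1;
  // multiProcessorCount is the number of compute units (60 on Radeon VII)
  printf("%s: %d CUs\n", prop.name, prop.multiProcessorCount);
  int gridSize = prop.multiProcessorCount;  // one block per CU
  printf("gridSize = %d\n", gridSize);
  return 0;
}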
This will be used to show that serialization happens when the GPU is overloaded.
As a baseline, a single execution on 21 CUs/blocks at a time takes 17465.2 ms on the GPU:
$ time ./a.out 100000
47| 29|101|
113| 59| 5|
17| 89| 71|
17465.2 ms
47| 29|101|
113| 59| 5|
17| 89| 71|
7143001us
real 0m26.161s
user 0m25.258s
sys 0m0.370s
$
I request 100,000/100,001/100,002 loops for the 1st/2nd/3rd instance.
Next, two runs in parallel, both with the "same" runtime on the GPU as before.
No surprise, since 60 CUs are available and only 2×21 = 42 CUs are requested.
It is nice that not even a small penalty has to be paid in GPU runtime:
$ time ./a.out 100000 $ time ./a.out 100001
47| 29|101| 47| 29|101|
113| 59| 5| 113| 59| 5|
17| 89| 71| 17| 89| 71|
17465.2 ms 17464.2 ms
47| 29|101| 47| 29|101|
113| 59| 5| 113| 59| 5|
17| 89| 71| 17| 89| 71|
7143001us 7090378us
real 0m26.161s real 0m27.484s
user 0m25.258s user 0m25.205s
sys 0m0.370s sys 0m0.824s
$ $
Last but not least, running three instances in parallel exhausts the 60 CUs available on the Radeon VII GPU.
3×21 = 63 CUs is too much, and here we see serialization happening: the 1st instance needs more than 35 s of GPU time, roughly double the 17.5 s baseline (a single-process sketch follows after the outputs):
$ time ./a.out 100000 $ time ./a.out 100001 $ time ./a.out 100002
47| 29|101| 47| 29|101| 47| 29|101|
113| 59| 5| 113| 59| 5| 113| 59| 5|
17| 89| 71| 17| 89| 71| 17| 89| 71|
38433.2 ms 17704.3 ms 22241.4 ms
47| 29|101| 47| 29|101| 47| 29|101|
113| 59| 5| 113| 59| 5| 113| 59| 5|
17| 89| 71| 17| 89| 71| 17| 89| 71|
7069767us 11209558us 7203511us
real 0m46.338s real 0m29.913s real 0m32.030s
user 0m46.472s user 0m29.575s user 0m30.125s
sys 0m0.226s sys 0m0.245s sys 0m0.678s
$ $ $
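The same competition for CUs can be provoked inside a single process; a sketch (my assumption of equivalent behavior, not from the gist) launching a busy kernel on two HIP streams, following the one-block-per-CU picture above:

#include <hip/hip_runtime.h>

__global__ void busy(int L) {  // stand-in for gpuHelloWorld
  volatile int x = 0;
  for (int i = 0; i < L; ++i) x += i;
}

int main() {
  hipStream_t s0, s1;
  hipStreamCreate(&s0);
  hipStreamCreate(&s1);
  // two concurrent 21-block launches: 42 CUs requested, 60 available,
  // so no serialization; a third stream would push past 60 CUs
  busy<<<21, 64, 0, s0>>>(1000000);
  busy<<<21, 64, 0, s1>>>(1000000);
  hipDeviceSynchronize();  // wait for both streams
  hipStreamDestroy(s0);
  hipStreamDestroy(s1);
  return 0;
}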
Intentionally making the code slower (diff below) reduces the GPU advantage:
before, the Radeon VII factor was 72.5; now it is 17174300/336942 = 50.97:
Diff: