Skip to content

Instantly share code, notes, and snippets.

@Hermann-SW
Created July 7, 2025 19:23
Show Gist options
  • Save Hermann-SW/5b76907bd20d2f64a7561b58c0b7a52d to your computer and use it in GitHub Desktop.
Save Hermann-SW/5b76907bd20d2f64a7561b58c0b7a52d to your computer and use it in GitHub Desktop.
Compare single thread pure integer performance of CPUs and [AMD] GPUs
/*
Compare single thread pure integer performance of CPUs and [AMD] GPUs
hipcc hip-magic.hip
cpplint --filter=-legal/copyright hip-magic.hip.cpp
cppcheck --enable=all --suppress=missingIncludeSystem hip-magic.hip.cpp --check-config
*/
#include <sys/time.h>
#include <stdint.h>
#include <hip/hip_runtime.h>
// Abort on any HIP runtime error. The HIP call is evaluated into a local
// first so that it still executes when NDEBUG disables assert(); the
// previous form assert(condition == hipSuccess) compiled the call itself
// out in release builds, silently skipping event create/record/sync.
#define HIP_CHECK(condition)                          \
  do {                                                \
    const hipError_t hip_check_err_ = (condition);    \
    assert(hip_check_err_ == hipSuccess);             \
    (void)hip_check_err_;  /* silence unused warning under NDEBUG */ \
  } while (0)
// Prime(i): true iff bit i is set in the MSB-first prime bitmap B that must
// be in scope as uint32_t B[4] (bit 31-(i%32) of word B[i>>5]); valid only
// for 0 <= i < 128. Expands to a fully parenthesized expression, which is
// why "if Prime(x) { ... }" below works without extra parentheses.
#define Prime(i) ((B[(i)>>5] & (0x80000000UL >> ((i)%32))) != 0)
// Iterate p over 3,5,7,... below m, executing block only when p is prime
// according to the bitmap B (see Prime above). block must be a single
// statement or braced/parenthesized group.
#define forall_odd_primes_less_than(p, m, block) \
for ((p) = 3; (p) < (m); (p) += 2) \
if (Prime((p))) \
block
// Kernel: each active thread repeats the same 3x3 magic-square-of-primes
// search L times as a pure-integer single-thread benchmark. Only the result
// of the final iteration (i == L) is printed; earlier iterations exist
// solely to consume time. N bounds the number of participating threads.
__global__ void gpuHelloWorld(int N, int L) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;  // global thread id
if (gid < N) {
// Bitmap of the primes below 128, one bit per integer, MSB-first per word
// (bits set at 2,3,5,7,11,... — consumed via the Prime() macro).
uint32_t B[] = {0x35145105, 0x4510414, 0x11411040, 0x45144001};
uint8_t p, a, b, c, d;
for(int i=1; i <= L; ++i) {
// Search for a 3x3 magic square of distinct primes with center p:
//     a    |   b    |  c
//   2p-d   |   p    |  d        where c = 3p-(a+b), d = 2a+b-2p,
//   2p-c   | 2p-b   | 2p-a      so every row/col/diagonal sums to 3p.
// The nested "loops" are macro-expanded for/if chains; goto is the only
// clean way to break out of all of them at once.
forall_odd_primes_less_than(p, 64,
forall_odd_primes_less_than(a, p,
if Prime(2*p-a) {
forall_odd_primes_less_than(b, p,
if ( (b != a) && Prime(2*p-b) ) {
c = 3*p - (a+b);
if ( (c < 2*p) && (2*p-c != a) && (2*p-c != b) && Prime(c) && Prime(2*p-c) ) {
if (2*a+b > 2*p) {
d = 2*a + b - 2*p; // 3*p - (3*p-(a+b)) - (2*p-a)
if ( (d != a) && (d != b) && (d != 2*p-c) && Prime(d) && Prime(2*p-d) ) {
if (i == L)
printf("%3u|%3u|%3u|\n%3u|%3u|%3u|\n%3u|%3u|%3u|\n",
a, b, c, 2*p-d, p, d, 2*p-c, 2*p-b, 2*p-a);
goto fin;
}
}
}
}
)
}
)
)
// Label needs a statement to attach to; this self-assignment is a no-op
// that also references i so the branch is not trivially removable.
fin: if (i == L) i=i;
}
}
}
// Runs the identical magic-square search once on the GPU (timed with HIP
// events, printed in ms) and once on the host CPU (timed with gettimeofday,
// printed in us), so the two single-thread integer throughputs can be
// compared. Optional argv[1] overrides the loop count L (default 10000).
int main(int argc, char *argv[]) {
int gridSize = 1; // #blocks
int blockSize = 1; // Threads per block
int N = gridSize * blockSize; // total #threads
int L = argc == 1 ? 10000 : atoi(argv[1]);
hipEvent_t start, stop;
HIP_CHECK(hipEventCreate(&start));
HIP_CHECK(hipEventCreate(&stop));
float elapsed_ms{};
// Bracket the kernel with events on the default stream; the elapsed time
// therefore includes the full kernel execution, not just the launch.
HIP_CHECK(hipEventRecord(start, hipStreamDefault));
gpuHelloWorld<<<gridSize, blockSize>>>(N, L); // Launch kernel
HIP_CHECK(hipEventRecord(stop, hipStreamDefault));
HIP_CHECK(hipEventSynchronize(stop));
HIP_CHECK(hipEventElapsedTime(&elapsed_ms, start, stop));
printf("%.1f ms\n", elapsed_ms);
HIP_CHECK(hipEventDestroy(stop));
HIP_CHECK(hipEventDestroy(start));
// CPU baseline: spin until the microsecond counter ticks so tv0 starts
// exactly on a microsecond boundary, reducing timer granularity error.
struct timeval tv0, tv1;
gettimeofday(&tv1, NULL); // wait for usec change
do gettimeofday(&tv0, NULL); while (tv0.tv_usec == tv1.tv_usec);
// Same prime bitmap and search as in gpuHelloWorld (see kernel comments);
// duplicated here because host and device code cannot share the locals.
uint32_t B[] = {0x35145105, 0x4510414, 0x11411040, 0x45144001};
uint8_t p, a, b, c, d;
for(int i=1; i <= L; ++i) {
forall_odd_primes_less_than(p, 64,
forall_odd_primes_less_than(a, p,
if Prime(2*p-a) {
forall_odd_primes_less_than(b, p,
if ( (b != a) && Prime(2*p-b) ) {
c = 3*p - (a+b);
if ( (c < 2*p) && (2*p-c != a) && (2*p-c != b) && Prime(c) && Prime(2*p-c) ) {
if (2*a+b > 2*p) {
d = 2*a + b - 2*p; // 3*p - (3*p-(a+b)) - (2*p-a)
if ( (d != a) && (d != b) && (d != 2*p-c) && Prime(d) && Prime(2*p-d) ) {
if (i == L)
printf("%3u|%3u|%3u|\n%3u|%3u|%3u|\n%3u|%3u|%3u|\n",
a, b, c, 2*p-d, p, d, 2*p-c, 2*p-b, 2*p-a);
goto fin;
}
}
}
}
)
}
)
)
// No-op statement so the label has a target (see kernel).
fin: if (i == L) i=i;
}
gettimeofday(&tv1, NULL);
printf("%ldus\n",
1000000*(tv1.tv_sec-tv0.tv_sec)+tv1.tv_usec-tv0.tv_usec);
return 0;
}
@Hermann-SW
Copy link
Author

Intentionally making code slower (diff below) by

  • replacing uint8_t variables with float
  • introducing "Round(i)" macro needed to round and return int (for array index access)

Before Radeon vii factor was 72.5, now it is 17174300/336942 = 50.97:

hermann@Radeon-vii:~/rocm-examples$ ./a.out 100000
 47| 29|101|
113| 59|  5|
 17| 89| 71|
17174.3 ms
 47| 29|101|
113| 59|  5|
 17| 89| 71|
336942us
hermann@Radeon-vii:~/rocm-examples$ 

Diff:

hermann@Radeon-vii:~/rocm-examples$ diff hip-magic.hip hip-magic-f.hip 
6a7
> #define Round(f) ((int)(f + 0.5))
8c9
< #define Prime(i) ((B[(i)>>5] & (0x80000000UL >> ((i)%32))) != 0)
---
> #define Prime(i) ((B[(Round(i))>>5] & (0x80000000UL >> ((Round(i))%32))) != 0)
11c12
<   for ((p) = 3; (p) < (m); (p) += 2)              \
---
>   for ((p) = 3.0; (p) < (m); (p) += 2.0)              \
21c22
<         uint8_t p, a, b, c, d;
---
>         float p, a, b, c, d;
37c38
<                     printf("%3u|%3u|%3u|\n%3u|%3u|%3u|\n%3u|%3u|%3u|\n",
---
>                     printf("%3.0f|%3.0f|%3.0f|\n%3.0f|%3.0f|%3.0f|\n%3.0f|%3.0f|%3.0f|\n",
85c86
<     uint8_t p, a, b, c, d;
---
>     float p, a, b, c, d;
101c102
<                     printf("%3u|%3u|%3u|\n%3u|%3u|%3u|\n%3u|%3u|%3u|\n",
---
>                     printf("%3.0f|%3.0f|%3.0f|\n%3.0f|%3.0f|%3.0f|\n%3.0f|%3.0f|%3.0f|\n",
hermann@Radeon-vii:~/rocm-examples$ 

@Hermann-SW
Copy link
Author

Hermann-SW commented Jul 8, 2025

From:
https://rocm.blogs.amd.com/ecosystems-and-partners/rocm-revisited-hip/README.html#threads-wavefronts-blocks-grids

Thread: The smallest unit of execution. Each thread runs an instance of the kernel (GPU function) and performs calculations on a subset of data.

Block: A group of threads, often organized, can cooperate by sharing data in shared memory. Each block operates independently of others.

That means that each block (CU) can run different kernels at the same time.
So it is OK to split the many magic square computations to the 60 CUs of Radeon vii GPU.
Here the first thread of each CU is used.

Below code diff does that, executes L loops on each of the 60 CUs on GPU, or L*60 on CPU.
Now GPU is 7/6× faster than i3-6100 CPU (runtimes for 6 million magic square computations on GPU and CPU):

hermann@Radeon-vii:~/rocm-examples$ time ./a.out 100000
 47| 29|101|
113| 59|  5|
 17| 89| 71|
17465.1 ms
 47| 29|101|
113| 59|  5|
 17| 89| 71|
20195735us

real	0m40.245s
user	0m38.427s
sys	0m0.672s
hermann@Radeon-vii:~/rocm-examples$

Yes, i3-6100 CPU has 4 cores.
But running the same code in current example allows to run 64 threads per CU/block.
Which would make the GPU 64/4*7/6=18.67× faster than i3-6100 CPU.

Diff:

hermann@Radeon-vii:~/rocm-examples$ diff hip-magic-f.hip hip-magic-f60.hip 
20c20
<     if (gid < N) {
---
>     if (gid < N && gid % blockDim.x == 0) {  // first thread of all blocks
37c37,38
<                   if (i == L)
---
>                   assert(a == 47 && b == 29);
>                   if (i == L && gid == 0) {
40c41
< 
---
>                   }
56,57c57,58
<     int gridSize = 1;  // #blocks
<     int blockSize = 1;  // Threads per block
---
>     int gridSize = 60;  // #blocks
>     int blockSize = 64;  // Threads per block
88c89
< for(int i=1; i <= L; ++i) {
---
> for(int i=1; i <= L*gridSize; ++i) {
101c102,103
<                   if (i == L)
---
>                   assert(a == 47 && b == 29);
>                   if (i == L*gridSize)
hermann@Radeon-vii:~/rocm-examples$ 

@Hermann-SW
Copy link
Author

New hip code
https://stamm-wilbrandt.de/forum/hip-magic-f21.hip

is the previous one, but utilizing only 21 CUs and not the 60 CUs the Radeon vii GPU has:

hermann@Radeon-vii:~/rocm-examples$ diff hip-magic-f60.hip hip-magic-f21.hip 
57c57
<     int gridSize = 60;  // #blocks
---
>     int gridSize = 21;  // #blocks
hermann@Radeon-vii:~/rocm-examples$ 

This will be used to see that serialization happens when overloading the GPU.

As baseline, a single execution on 21 CUs/blocks at a time, with 17465.2 ms on GPU:

$ time ./a.out 100000
 47| 29|101|
113| 59|  5|
 17| 89| 71|
17465.2 ms
 47| 29|101|
113| 59|  5|
 17| 89| 71|
7143001us

real	0m26.161s
user	0m25.258s
sys	0m0.370s
$ 

I request 100,000/100,001/100,002 loops for the 1st/2nd/3rd instance.
Next, two runs in parallel, both with the "same" runtime on GPU as before.
No surprise, since 60 CUs are available, and 2×21=42 CUs were requested.
Nice that not even a small penalty has to be paid in GPU runtimes:

$ time ./a.out 100000		    $ time ./a.out 100001
 47| 29|101|			     47| 29|101|
113| 59|  5|			    113| 59|  5|
 17| 89| 71|			     17| 89| 71|
17465.2 ms			    17464.2 ms
 47| 29|101|			     47| 29|101|
113| 59|  5|			    113| 59|  5|
 17| 89| 71|			     17| 89| 71|
7143001us			    7090378us
				    
real	0m26.161s		    real    0m27.484s
user	0m25.258s		    user    0m25.205s
sys	0m0.370s		    sys	    0m0.824s
$				    $

Last, but not least, running three in parallel exhausts the 60 CUs available on Radeon vii GPU.
3×21=63 CUs is too much, and here we see serialization happening for 1st instance with more than 35s GPU time:

$ time ./a.out 100000	$ time ./a.out 100001	$ time ./a.out 100002
 47| 29|101|		 47| 29|101|		 47| 29|101|
113| 59|  5|		113| 59|  5|		113| 59|  5|
 17| 89| 71|		 17| 89| 71|		 17| 89| 71|
38433.2 ms		17704.3 ms		22241.4 ms
 47| 29|101|		 47| 29|101|		 47| 29|101|
113| 59|  5|		113| 59|  5|		113| 59|  5|
 17| 89| 71|		 17| 89| 71|		 17| 89| 71|
7069767us		11209558us		7203511us
						
real	0m46.338s	real	0m29.913s	real	0m32.030s
user	0m46.472s	user	0m29.575s	user	0m30.125s
sys	0m0.226s	sys	0m0.245s	sys	0m0.678s
$			$			$

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment