@Hermann-SW
Created July 7, 2025 19:23
Compare single thread pure integer performance of CPUs and [AMD] GPUs
/*
Compare single thread pure integer performance of CPUs and [AMD] GPUs
hipcc hip-magic.hip
cpplint --filter=-legal/copyright hip-magic.hip.cpp
cppcheck --enable=all --suppress=missingIncludeSystem hip-magic.hip.cpp --check-config
*/
#include <sys/time.h>
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <hip/hip_runtime.h>
#define HIP_CHECK(condition) assert(condition == hipSuccess);
#define Prime(i) ((B[(i)>>5] & (0x80000000UL >> ((i)%32))) != 0)
#define forall_odd_primes_less_than(p, m, block) \
    for ((p) = 3; (p) < (m); (p) += 2) \
        if (Prime((p))) \
            block
// Each thread repeatedly (L times) searches for a 3x3 magic square of
// distinct odd primes below 128 with center prime p; the square found is
// printed only on the last iteration.
__global__ void gpuHelloWorld(int N, int L) {
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid < N) {
        uint32_t B[] = {0x35145105, 0x4510414, 0x11411040, 0x45144001};
        uint8_t p, a, b, c, d;
        for (int i = 1; i <= L; ++i) {
            forall_odd_primes_less_than(p, 64,
                forall_odd_primes_less_than(a, p,
                    if Prime(2*p-a) {
                        forall_odd_primes_less_than(b, p,
                            if ((b != a) && Prime(2*p-b)) {
                                c = 3*p - (a+b);
                                if ((c < 2*p) && (2*p-c != a) && (2*p-c != b)
                                    && Prime(c) && Prime(2*p-c)) {
                                    if (2*a+b > 2*p) {
                                        d = 2*a + b - 2*p;  // 3*p - (3*p-(a+b)) - (2*p-a)
                                        if ((d != a) && (d != b) && (d != 2*p-c)
                                            && Prime(d) && Prime(2*p-d)) {
                                            if (i == L)
                                                printf("%3u|%3u|%3u|\n"
                                                       "%3u|%3u|%3u|\n"
                                                       "%3u|%3u|%3u|\n",
                                                       a, b, c, 2*p-d, p, d,
                                                       2*p-c, 2*p-b, 2*p-a);
                                            goto fin;
                                        }
                                    }
                                }
                            }
                        )
                    }
                )
            )
            fin: if (i == L) i = i;  // no-op; the label just needs a statement
        }
    }
}
int main(int argc, char *argv[]) {
    int gridSize = 1;              // #blocks
    int blockSize = 1;             // Threads per block
    int N = gridSize * blockSize;  // total #threads
    int L = argc == 1 ? 10000 : atoi(argv[1]);

    // GPU run: single-thread kernel, timed with HIP events
    hipEvent_t start, stop;
    HIP_CHECK(hipEventCreate(&start));
    HIP_CHECK(hipEventCreate(&stop));
    float elapsed_ms{};
    HIP_CHECK(hipEventRecord(start, hipStreamDefault));
    gpuHelloWorld<<<gridSize, blockSize>>>(N, L);  // Launch kernel
    HIP_CHECK(hipEventRecord(stop, hipStreamDefault));
    HIP_CHECK(hipEventSynchronize(stop));
    HIP_CHECK(hipEventElapsedTime(&elapsed_ms, start, stop));
    printf("%.1f ms\n", elapsed_ms);
    HIP_CHECK(hipEventDestroy(stop));
    HIP_CHECK(hipEventDestroy(start));

    // CPU run: identical single-thread search, timed with gettimeofday()
    struct timeval tv0, tv1;
    gettimeofday(&tv1, NULL);  // wait for usec change
    do gettimeofday(&tv0, NULL); while (tv0.tv_usec == tv1.tv_usec);
    uint32_t B[] = {0x35145105, 0x4510414, 0x11411040, 0x45144001};
    uint8_t p, a, b, c, d;
    for (int i = 1; i <= L; ++i) {
        forall_odd_primes_less_than(p, 64,
            forall_odd_primes_less_than(a, p,
                if Prime(2*p-a) {
                    forall_odd_primes_less_than(b, p,
                        if ((b != a) && Prime(2*p-b)) {
                            c = 3*p - (a+b);
                            if ((c < 2*p) && (2*p-c != a) && (2*p-c != b)
                                && Prime(c) && Prime(2*p-c)) {
                                if (2*a+b > 2*p) {
                                    d = 2*a + b - 2*p;  // 3*p - (3*p-(a+b)) - (2*p-a)
                                    if ((d != a) && (d != b) && (d != 2*p-c)
                                        && Prime(d) && Prime(2*p-d)) {
                                        if (i == L)
                                            printf("%3u|%3u|%3u|\n"
                                                   "%3u|%3u|%3u|\n"
                                                   "%3u|%3u|%3u|\n",
                                                   a, b, c, 2*p-d, p, d,
                                                   2*p-c, 2*p-b, 2*p-a);
                                        goto fin;
                                    }
                                }
                            }
                        }
                    )
                }
            )
        )
        fin: if (i == L) i = i;  // no-op; the label just needs a statement
    }
    gettimeofday(&tv1, NULL);
    printf("%ldus\n",
           1000000*(tv1.tv_sec-tv0.tv_sec)+tv1.tv_usec-tv0.tv_usec);
    return 0;
}
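
For reference: the hardcoded array B[] indexed by the Prime(i) macro is a bit mask of the primes below 128, stored MSB-first within each 32-bit word, and the printed 3x3 grid is a magic square of distinct primes whose rows, columns and diagonals all sum to 3*p (177 for the p = 59 square in the runs below). A minimal standalone sketch (an assumption, not part of the gist) that regenerates the four mask words for cross-checking:

/*
Sketch (assumption, not part of the gist): regenerate the prime bit mask B[]
used by the Prime(i) macro; bit (31 - i%32) of B[i/32] is set iff i is prime.
g++ -o primemask primemask.cpp
*/
#include <stdint.h>
#include <stdio.h>

int main() {
    bool is_prime[128];
    for (int i = 0; i < 128; ++i) is_prime[i] = (i >= 2);
    for (int i = 2; i < 128; ++i)  // simple sieve of Eratosthenes
        if (is_prime[i])
            for (int j = 2*i; j < 128; j += i) is_prime[j] = false;

    uint32_t B[4] = {0, 0, 0, 0};
    for (int i = 0; i < 128; ++i)
        if (is_prime[i]) B[i >> 5] |= 0x80000000UL >> (i % 32);

    // expected: 0x35145105 0x04510414 0x11411040 0x45144001
    printf("0x%08X 0x%08X 0x%08X 0x%08X\n", B[0], B[1], B[2], B[3]);
    return 0;
}

Its output should match the constants hardcoded in hip-magic.hip.cpp.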
@Hermann-SW (Author)

The new HIP code
https://stamm-wilbrandt.de/forum/hip-magic-f21.hip

is the previous one, but utilizing only 21 CUs instead of the 60 CUs the Radeon VII GPU has:

hermann@Radeon-vii:~/rocm-examples$ diff hip-magic-f60.hip hip-magic-f21.hip 
57c57
<     int gridSize = 60;  // #blocks
---
>     int gridSize = 21;  // #blocks
hermann@Radeon-vii:~/rocm-examples$ 

This will be used to see that serialization happens when overloading the GPU.
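
As a side note, the CU count that gridSize is tuned against can be queried at run time with hipGetDeviceProperties(); a minimal sketch (an assumption, not part of the gist), which should report 60 CUs for the Radeon VII:

/*
Sketch (assumption, not part of the gist): print the number of CUs of device 0.
hipcc -o cucount cucount.hip
*/
#include <stdio.h>
#include <hip/hip_runtime.h>

int main() {
    hipDeviceProp_t prop;
    if (hipGetDeviceProperties(&prop, 0) != hipSuccess) return 1;
    printf("%s: %d CUs\n", prop.name, prop.multiProcessorCount);
    return 0;
}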

As a baseline, a single execution using 21 CUs/blocks at a time, with 17465.2 ms on the GPU:

$ time ./a.out 100000
 47| 29|101|
113| 59|  5|
 17| 89| 71|
17465.2 ms
 47| 29|101|
113| 59|  5|
 17| 89| 71|
7143001us

real	0m26.161s
user	0m25.258s
sys	0m0.370s
$ 

I request 100,000/100,001/100,002 loops for the 1st/2nd/3rd instance.
Next, two runs in parallel, both with the "same" runtime on the GPU as before.
No surprise, since 60 CUs are available and 2×21 = 42 CUs are requested.
Nice that not even a small penalty has to be paid in GPU runtimes:

$ time ./a.out 100000		    $ time ./a.out 100001
 47| 29|101|			     47| 29|101|
113| 59|  5|			    113| 59|  5|
 17| 89| 71|			     17| 89| 71|
17465.2 ms			    17464.2 ms
 47| 29|101|			     47| 29|101|
113| 59|  5|			    113| 59|  5|
 17| 89| 71|			     17| 89| 71|
7143001us			    7090378us
				    
real	0m26.161s		    real    0m27.484s
user	0m25.258s		    user    0m25.205s
sys	0m0.370s		    sys	    0m0.824s
$				    $

Last but not least, running three instances in parallel exceeds the 60 CUs available on the Radeon VII GPU.
3×21 = 63 CUs is too much, and here we see serialization happening for the 1st instance, with more than 35 s of GPU time:

$ time ./a.out 100000	$ time ./a.out 100001	$ time ./a.out 100002
 47| 29|101|		 47| 29|101|		 47| 29|101|
113| 59|  5|		113| 59|  5|		113| 59|  5|
 17| 89| 71|		 17| 89| 71|		 17| 89| 71|
38433.2 ms		17704.3 ms		22241.4 ms
 47| 29|101|		 47| 29|101|		 47| 29|101|
113| 59|  5|		113| 59|  5|		113| 59|  5|
 17| 89| 71|		 17| 89| 71|		 17| 89| 71|
7069767us		11209558us		7203511us
						
real	0m46.338s	real	0m29.913s	real	0m32.030s
user	0m46.472s	user	0m29.575s	user	0m30.125s
sys	0m0.226s	sys	0m0.245s	sys	0m0.678s
$			$			$
