Created
July 7, 2025 19:23
Compare single thread pure integer performance of CPUs and [AMD] GPUs
/*
  Compare single thread pure integer performance of CPUs and [AMD] GPUs

  hipcc hip-magic.hip
  cpplint --filter=-legal/copyright hip-magic.hip.cpp
  cppcheck --enable=all --suppress=missingIncludeSystem hip-magic.hip.cpp --check-config
*/
#include <sys/time.h>
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <hip/hip_runtime.h>

#define HIP_CHECK(condition) assert((condition) == hipSuccess)

// B[] below is a bitmap of the primes less than 128:
// bit i of B[i/32] (most significant bit first) is set iff i is prime.
#define Prime(i) ((B[(i)>>5] & (0x80000000UL >> ((i)%32))) != 0)

#define forall_odd_primes_less_than(p, m, block) \
  for ((p) = 3; (p) < (m); (p) += 2) \
    if (Prime((p))) \
      block

// Repeat L times: search a 3x3 magic square of primes with center p
// (every row, column and diagonal sums to 3*p); print it on the last pass.
__global__ void gpuHelloWorld(int N, int L) {
  int gid = blockIdx.x * blockDim.x + threadIdx.x;
  if (gid < N) {
    uint32_t B[] = {0x35145105, 0x4510414, 0x11411040, 0x45144001};
    uint8_t p, a, b, c, d;
    for (int i = 1; i <= L; ++i) {
      forall_odd_primes_less_than(p, 64,
        forall_odd_primes_less_than(a, p,
          if (Prime(2*p-a)) {
            forall_odd_primes_less_than(b, p,
              if ( (b != a) && Prime(2*p-b) ) {
                c = 3*p - (a+b);
                if ( (c < 2*p) && (2*p-c != a) && (2*p-c != b) && Prime(c) && Prime(2*p-c) ) {
                  if (2*a+b > 2*p) {
                    d = 2*a + b - 2*p;  // 3*p - (3*p-(a+b)) - (2*p-a)
                    if ( (d != a) && (d != b) && (d != 2*p-c) && Prime(d) && Prime(2*p-d) ) {
                      if (i == L)
                        printf("%3u|%3u|%3u|\n%3u|%3u|%3u|\n%3u|%3u|%3u|\n",
                               a, b, c, 2*p-d, p, d, 2*p-c, 2*p-b, 2*p-a);
                      goto fin;  // leave all nested macro loops once a square is found
                    }
                  }
                }
              }
            )
          }
        )
      )
      fin: if (i == L) i = i;  // dummy statement; a label needs a statement to attach to
    }
  }
}

int main(int argc, char *argv[]) {
  int gridSize = 1;              // #blocks
  int blockSize = 1;             // threads per block
  int N = gridSize * blockSize;  // total #threads
  int L = argc == 1 ? 10000 : atoi(argv[1]);

  // GPU part: time L search iterations in a single-thread kernel with HIP events.
  hipEvent_t start, stop;
  HIP_CHECK(hipEventCreate(&start));
  HIP_CHECK(hipEventCreate(&stop));
  float elapsed_ms{};
  HIP_CHECK(hipEventRecord(start, hipStreamDefault));
  gpuHelloWorld<<<gridSize, blockSize>>>(N, L);  // launch kernel
  HIP_CHECK(hipEventRecord(stop, hipStreamDefault));
  HIP_CHECK(hipEventSynchronize(stop));
  HIP_CHECK(hipEventElapsedTime(&elapsed_ms, start, stop));
  printf("%.1f ms\n", elapsed_ms);
  HIP_CHECK(hipEventDestroy(stop));
  HIP_CHECK(hipEventDestroy(start));

  // CPU part: the identical search on a single host thread, timed with gettimeofday().
  struct timeval tv0, tv1;
  gettimeofday(&tv1, NULL);  // wait for usec change
  do gettimeofday(&tv0, NULL); while (tv0.tv_usec == tv1.tv_usec);
  uint32_t B[] = {0x35145105, 0x4510414, 0x11411040, 0x45144001};
  uint8_t p, a, b, c, d;
  for (int i = 1; i <= L; ++i) {
    forall_odd_primes_less_than(p, 64,
      forall_odd_primes_less_than(a, p,
        if (Prime(2*p-a)) {
          forall_odd_primes_less_than(b, p,
            if ( (b != a) && Prime(2*p-b) ) {
              c = 3*p - (a+b);
              if ( (c < 2*p) && (2*p-c != a) && (2*p-c != b) && Prime(c) && Prime(2*p-c) ) {
                if (2*a+b > 2*p) {
                  d = 2*a + b - 2*p;  // 3*p - (3*p-(a+b)) - (2*p-a)
                  if ( (d != a) && (d != b) && (d != 2*p-c) && Prime(d) && Prime(2*p-d) ) {
                    if (i == L)
                      printf("%3u|%3u|%3u|\n%3u|%3u|%3u|\n%3u|%3u|%3u|\n",
                             a, b, c, 2*p-d, p, d, 2*p-c, 2*p-b, 2*p-a);
                    goto fin;
                  }
                }
              }
            }
          )
        }
      )
    )
    fin: if (i == L) i = i;  // dummy statement; see kernel
  }
  gettimeofday(&tv1, NULL);
  printf("%ldus\n",
         1000000*(tv1.tv_sec-tv0.tv_sec)+tv1.tv_usec-tv0.tv_usec);
  return 0;
}
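For readers wondering about the magic constants in B: the four 32-bit words form a bitmap of the primes below 128, where bit i (most significant bit first) of B[i/32] is set iff i is prime. A minimal host-only sketch (not part of the gist) that dumps the table with the same Prime() macro:

// Sketch: print the primes encoded in the bitmap used by hip-magic.hip.
// Plain host C++, no HIP needed.
#include <stdint.h>
#include <stdio.h>

#define Prime(i) ((B[(i)>>5] & (0x80000000UL >> ((i)%32))) != 0)

int main() {
  uint32_t B[] = {0x35145105, 0x4510414, 0x11411040, 0x45144001};
  for (int i = 0; i < 128; ++i)
    if (Prime(i)) printf("%d ", i);  // prints 2 3 5 7 ... 127
  printf("\n");
  return 0;
}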
New hip code
https://stamm-wilbrandt.de/forum/hip-magic-f21.hip
is the previous one, but utilizing only 21 of the 60 CUs the Radeon VII GPU has.
It will be used to show that serialization happens when the GPU is overloaded.
As a baseline, a single execution on 21 CUs/blocks at a time took 17465.2 ms on the GPU.
I request 100,000/100,001/100,002 loops for the 1st/2nd/3rd instance.
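Assuming hip-magic-f21.hip differs from the gist above only in its launch configuration (one block per CU, 21 blocks of one thread each), the relevant lines of main() would change roughly like this sketch:

// Hypothetical sketch of the f21 launch configuration; an assumption only,
// the authoritative source is the hip-magic-f21.hip link above.
int gridSize = 21;             // one block per CU -> occupies 21 of the 60 CUs
int blockSize = 1;             // still a single thread per block
int N = gridSize * blockSize;  // total #threads
int L = argc == 1 ? 10000 : atoi(argv[1]);  // e.g. 100000/100001/100002 per instance
gpuHelloWorld<<<gridSize, blockSize>>>(N, L);  // launch kernel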
Next, two runs in parallel, both with the same GPU runtime as before.
No surprise, since 60 CUs are available and only 2×21=42 CUs are requested.
It is nice that not even a small penalty has to be paid in GPU runtime.
Last but not least, running three instances in parallel exhausts the 60 CUs available on the Radeon VII GPU.
3×21=63 CUs is too many, and here we see serialization happening for the 1st instance, with more than 35 s of GPU time.
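The CU count used in this reasoning can be read back from the HIP runtime; a minimal sketch (not part of the gist) using hipGetDeviceProperties():

// Sketch: report how many compute units (multiprocessors) the active GPU has,
// e.g. 60 on a Radeon VII.
#include <hip/hip_runtime.h>
#include <stdio.h>
#include <assert.h>

int main() {
  hipDeviceProp_t props;
  assert(hipGetDeviceProperties(&props, 0) == hipSuccess);  // device 0
  printf("%s: %d CUs\n", props.name, props.multiProcessorCount);
  return 0;
}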