Skip to content

Instantly share code, notes, and snippets.

@grigory-rechistov-intel
Created April 30, 2025 08:05
Show Gist options
  • Save grigory-rechistov-intel/51a71247ae519c85c0e33b2fb6e5d579 to your computer and use it in GitHub Desktop.
Save grigory-rechistov-intel/51a71247ae519c85c0e33b2fb6e5d579 to your computer and use it in GitHub Desktop.
/*
Adapted from answer https://stackoverflow.com/a/79598751/530714
to question "Example of a microbenchmark to demonstrate that code
inlining is not always beneficial to performance".
To build two variants of the code:
gcc -Wall -O3 -o inlinecost inlinecost.c
gcc -Wall -O3 -o inlinecost-unrolled inlinecost.c -DUNROLL
To run it and compare the latencies:
./inlinecost
./inlinecost-unrolled
*/
#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>
#include <time.h>
// From https://prng.di.unimi.it/xoshiro256starstar.c
// Rotates the 64-bit value x left by k bit positions.
// The count is reduced modulo 64, so k == 0 (or any multiple of 64) returns
// x unchanged instead of evaluating an undefined 64-bit shift by 64.
static inline uint64_t rotl(const uint64_t x, int k) {
    const unsigned n = (unsigned)k & 63u;
    // (64 - n) & 63 maps n == 0 to a right shift of 0 rather than 64.
    return (x << n) | (x >> ((64u - n) & 63u));
}
// Generator state: four 64-bit words (256 bits), read and updated in place
// by next(). Initialized with fixed non-zero constants.
static uint64_t s[4] = {1,2,3,4};
// One step of the xoshiro256** generator: derives the output from the
// current s[1], then advances the four-word state in s.
// This PRNG is meant to get inlined so that it generates lots of code.
static uint64_t next(void) {
    // Work on local copies of the state words and store them back at the end.
    uint64_t w0 = s[0];
    uint64_t w1 = s[1];
    uint64_t w2 = s[2];
    uint64_t w3 = s[3];

    // Both values are taken from the state word before it is updated.
    const uint64_t output = rotl(w1 * 5, 7) * 9;
    const uint64_t shifted = w1 << 17;

    w2 ^= w0;
    w3 ^= w1;
    w1 ^= w2;
    w0 ^= w3;
    w2 ^= shifted;
    w3 = rotl(w3, 45);

    s[0] = w0;
    s[1] = w1;
    s[2] = w2;
    s[3] = w3;
    return output;
}
// The measured workload. Starting from 0, runs 5000 iterations: when the
// running total is odd, the top four bits of a fresh next() value (0..15)
// are added; when it is even, 1 is added. Returns the final total.
// When built with -DUNROLL, GCC is asked to unroll the loop.
uint64_t benchmark() {
    enum { ITERATIONS = 5000 };
    uint64_t total = 0;
    // The pragma has to sit directly in front of the for statement.
#ifdef UNROLL
#pragma GCC unroll 65534
#endif
    for (int iter = 0; iter < ITERATIONS; iter++) {
        if (total & 1) {
            total += next() >> 60;
        } else {
            total += 1;
        }
    }
    return total;
}
// Runs benchmark() ten times. Each run is bracketed by two CLOCK_MONOTONIC
// readings, and the elapsed time in microseconds is printed together with
// the value the run computed.
// Returns 0 on success, or 1 if a clock reading fails.
int main(void) {
    struct timespec t0, t1;
    for (int i = 0; i < 10; i++) {
        // Capture both status codes but inspect them only after the second
        // reading, so no extra work lands inside the timed region.
        const int rc0 = clock_gettime(CLOCK_MONOTONIC, &t0);
        uint64_t res = benchmark();
        const int rc1 = clock_gettime(CLOCK_MONOTONIC, &t1);
        if (rc0 != 0 || rc1 != 0) {
            fprintf(stderr, "clock_gettime(CLOCK_MONOTONIC) failed\n");
            return 1;
        }
        // Whole seconds first, then the (possibly negative) nanosecond part.
        double dt = t1.tv_sec - t0.tv_sec;
        dt += (t1.tv_nsec - t0.tv_nsec) / 1e9;
        printf("Took %.1f us to calculate %" PRIu64 "\n", dt * 1e6, res);
    }
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment