grigory-rechistov-intel · April 30, 2025 08:05
diff --git a/inlinecost.c b/inlinecost.c
 /*
 Adapted from answer https://stackoverflow.com/a/79598751/530714
 to question "Example of a microbenchmark to demonstrate that code
 inlining is not always beneficial to performance".

 To build two variants of the code:
 gcc -Wall -O3 -o inlinecost inlinecost.c
 gcc -Wall -O3 -o inlinecost-unrolled inlinecost.c -DUNROLL

 To run it and compare the latencies:
    ./inlinecost
    ./inlinecost-unrolled
 */

 #include <stdint.h>
 #include <inttypes.h>
 #include <stdio.h>
 #include <time.h>

 // From https://prng.di.unimi.it/xoshiro256starstar.c
 // Rotate the 64-bit value x left by k bit positions and return the result.
 // The count is reduced modulo 64 before shifting, so k == 0 (or any other
 // multiple of 64, or a negative count) never turns into a shift by the full
 // width of the type, which is undefined behaviour in C. For k in 1..63 the
 // result is the same as the plain (x << k) | (x >> (64 - k)) form.
 static inline uint64_t rotl(const uint64_t x, int k) {
    const unsigned n = (unsigned)k & 63u;

    return (x << n) | (x >> ((64u - n) & 63u));
 }
 // Generator state: four 64-bit words that next() reads and updates.
 // Seeded with fixed constants, so every run starts from the same state.
 static uint64_t s[4] = { [0] = 1, [1] = 2, [2] = 3, [3] = 4 };


 // A PRNG step that is meant to be inlined at its call site, so that it
 // contributes a sizeable amount of code there.
 //
 // The return value is computed from s[1] as it was on entry; afterwards all
 // four words of the state array s are mixed in place. Every call therefore
 // both yields a number and advances the generator.
 static uint64_t next(void) {
    const uint64_t scrambled = rotl(s[1] * 5, 7);
    const uint64_t shifted = s[1] << 17; // taken before s[1] is modified

    // The order of these updates matters: each line reads words that the
    // previous lines may already have changed.
    s[2] = s[2] ^ s[0];
    s[3] = s[3] ^ s[1];
    s[1] = s[1] ^ s[2];
    s[0] = s[0] ^ s[3];
    s[2] = s[2] ^ shifted;
    s[3] = rotl(s[3], 45);

    return scrambled * 9;
 }

 // The timed kernel: runs 5000 iterations over a running sum and returns it.
 //
 // In each iteration the low bit of the sum selects the branch: when it is
 // set, the top four bits of a fresh next() value (0..15) are added; when it
 // is clear, 1 is added. The branch taken thus depends on the previous
 // iterations' PRNG output.
 //
 // When built with -DUNROLL, the pragma asks GCC to unroll the loop with a
 // factor larger than its trip count. The pragma has to stay directly in
 // front of the for statement it applies to.
 uint64_t benchmark(void) {
    uint64_t sum = 0;
 #ifdef UNROLL
 #pragma GCC unroll 65534
 #endif
    for (int i = 0; i < 5000; i++) {
        if (sum & 1) {
            sum += next() >> 60;
        } else {
            sum += 1;
        }
    }
    return sum;
 }

 // Runs benchmark() ten times and prints, for each run, the elapsed wall time
 // in microseconds together with the value benchmark() returned.
 //
 // The PRNG state is not reset between runs, so the printed values may differ
 // from one run to the next.
 //
 // Returns 0 on success, 1 if a clock reading failed.
 int main(void) {
    struct timespec t0, t1;

    for (int i = 0; i < 10; i++) {
        // Only the status codes are recorded here; they are inspected after
        // the second reading so that no extra work sits inside the timed
        // region.
        const int rc0 = clock_gettime(CLOCK_MONOTONIC, &t0);
        uint64_t res = benchmark();
        const int rc1 = clock_gettime(CLOCK_MONOTONIC, &t1);

        if (rc0 != 0 || rc1 != 0) {
            fprintf(stderr, "clock_gettime failed\n");
            return 1;
        }

        // Whole seconds plus the (possibly negative) nanosecond remainder.
        double dt = t1.tv_sec - t0.tv_sec;
        dt += (t1.tv_nsec - t0.tv_nsec) / 1e9;

        printf("Took %.1f us to calculate %" PRIu64 "\n", dt * 1e6, res);
    }
    return 0;
 }
	/*
	Adapted from answer https://stackoverflow.com/a/79598751/530714
	to question "Example of a microbenchmark to demonstrate that code
	inlining is not always beneficial to performance".

	To build two variants of the code:
	gcc -Wall -O3 -o inlinecost inlinecost.c
	gcc -Wall -O3 -o inlinecost-unrolled inlinecost.c -DUNROLL

	To run it and compare the latencies:
	./inlinecost
	./inlinecost-unrolled
	*/

	#include <stdint.h>
	#include <inttypes.h>
	#include <stdio.h>
	#include <time.h>

	// From https://prng.di.unimi.it/xoshiro256starstar.c
	// Rotate the 64-bit value x left by k bit positions and return the result.
	// The two shifted halves are combined with a bitwise OR. The count is
	// reduced modulo 64 before shifting, so k == 0 (or any other multiple of
	// 64, or a negative count) never turns into a shift by the full width of
	// the type, which is undefined behaviour in C.
	static inline uint64_t rotl(const uint64_t x, int k) {
	    const unsigned n = (unsigned)k & 63u;

	    return (x << n) | (x >> ((64u - n) & 63u));
	}
	// Generator state: four 64-bit words that next() reads and updates.
	// Seeded with fixed constants, so every run starts from the same state.
	static uint64_t s[4] = { [0] = 1, [1] = 2, [2] = 3, [3] = 4 };


	// A PRNG step that is meant to be inlined at its call site, so that it
	// contributes a sizeable amount of code there.
	//
	// The return value is computed from s[1] as it was on entry; afterwards
	// all four words of the state array s are mixed in place. Every call
	// therefore both yields a number and advances the generator.
	static uint64_t next(void) {
	    const uint64_t scrambled = rotl(s[1] * 5, 7);
	    const uint64_t shifted = s[1] << 17; // taken before s[1] is modified

	    // The order of these updates matters: each line reads words that the
	    // previous lines may already have changed.
	    s[2] = s[2] ^ s[0];
	    s[3] = s[3] ^ s[1];
	    s[1] = s[1] ^ s[2];
	    s[0] = s[0] ^ s[3];
	    s[2] = s[2] ^ shifted;
	    s[3] = rotl(s[3], 45);

	    return scrambled * 9;
	}

	// The timed kernel: runs 5000 iterations over a running sum and returns it.
	//
	// In each iteration the low bit of the sum selects the branch: when it is
	// set, the top four bits of a fresh next() value (0..15) are added; when
	// it is clear, 1 is added. The branch taken thus depends on the previous
	// iterations' PRNG output.
	//
	// When built with -DUNROLL, the pragma asks GCC to unroll the loop with a
	// factor larger than its trip count. The pragma has to stay directly in
	// front of the for statement it applies to.
	uint64_t benchmark(void) {
	    uint64_t sum = 0;
	#ifdef UNROLL
	#pragma GCC unroll 65534
	#endif
	    for (int i = 0; i < 5000; i++) {
	        if (sum & 1) {
	            sum += next() >> 60;
	        } else {
	            sum += 1;
	        }
	    }
	    return sum;
	}

	// Runs benchmark() ten times and prints, for each run, the elapsed wall
	// time in microseconds together with the value benchmark() returned.
	//
	// The PRNG state is not reset between runs, so the printed values may
	// differ from one run to the next.
	//
	// Returns 0 on success, 1 if a clock reading failed.
	int main(void) {
	    struct timespec t0, t1;

	    for (int i = 0; i < 10; i++) {
	        // Only the status codes are recorded here; they are inspected
	        // after the second reading so that no extra work sits inside the
	        // timed region.
	        const int rc0 = clock_gettime(CLOCK_MONOTONIC, &t0);
	        uint64_t res = benchmark();
	        const int rc1 = clock_gettime(CLOCK_MONOTONIC, &t1);

	        if (rc0 != 0 || rc1 != 0) {
	            fprintf(stderr, "clock_gettime failed\n");
	            return 1;
	        }

	        // Whole seconds plus the (possibly negative) nanosecond remainder.
	        double dt = t1.tv_sec - t0.tv_sec;
	        dt += (t1.tv_nsec - t0.tv_nsec) / 1e9;

	        printf("Took %.1f us to calculate %" PRIu64 "\n", dt * 1e6, res);
	    }
	    return 0;
	}