
@Boostibot
Last active January 31, 2025 18:38
custom implementation of CLOCK_REALTIME to show what's actually going on
#include <stdatomic.h>
#include <stdint.h>
#include <stdlib.h> //for strtoull
#include <time.h>
#include <stdio.h>
#include <string.h>
typedef struct vdso_timestamp_t {
    uint64_t sec;
    uint64_t nsec;
} vdso_timestamp_t;
#define CS_HRES_COARSE 0
#define CS_RAW 1
#define CS_BASES (CS_RAW + 1)
typedef struct vdso_data_t {
    uint32_t seq;
    int32_t clock_mode;
    uint64_t cycle_last;
    uint64_t mask;
    uint32_t mult;
    uint32_t shift;
    union {
        vdso_timestamp_t basetime[12];
        //struct timens_offset offset[12];
    };
    int32_t tz_minuteswest;
    int32_t tz_dsttime;
    uint32_t hrtimer_res;
    uint32_t __unused;
} vdso_data_t;
typedef struct Clock_Info {
    vdso_timestamp_t* offset_ptr;
    vdso_data_t* data;
    vdso_data_t* coarse;
    uint64_t nominal_freq;
    uint64_t mult;
    uint64_t init_state;
} Clock_Info;
#define CLOCK_REALTIME 0
#define CLOCK_MONOTONIC 1
#define CLOCK_PROCESS_CPUTIME_ID 2
#define CLOCK_THREAD_CPUTIME_ID 3
#define CLOCK_MONOTONIC_RAW 4
#define CLOCK_REALTIME_COARSE 5
#define CLOCK_MONOTONIC_COARSE 6
#define CLOCK_BOOTTIME 7
#define CLOCK_REALTIME_ALARM 8
#define CLOCK_BOOTTIME_ALARM 9
#define CLOCK_SGI_CYCLE 10
#define CLOCK_TAI 11
#include <x86intrin.h>
Clock_Info global_clock_info = {0};
void prepare_clock_info()
{
    //Find where the vvar page is mapped within this process.
    // This is a special page that is periodically updated by the kernel
    // with the precise time from some other hardware components which I haven't looked into.
    //The raw TSC time is not fully accurate and drifts by around 1000ns every second (on my machine).
    //Because of this most things prefer to talk in terms of ns. If we get a timestamp in ns
    // we are sure that we are accurate to at least the frequency of the kernel updating the precise time
    // (and then some because we use the TSC to "interpolate"). Then when we get some other timepoint in ns
    // and calculate the difference we know that the error is at most 2x the error of a single time point.
    // With the raw TSC we have no such guarantee.
    // see: https://gist.github.com/mildsunrise/c63505931534bd3c0e143c0db8cad3f3
    uint8_t* vvar_addr = NULL;
    {
        // quickly parse /proc/self/maps to find the [vvar] mapping
        char mmaps [4096*4] = {0};
        FILE* mmapsfile = fopen("/proc/self/maps", "r");
        if (!mmapsfile)
            fprintf(stderr, "prepare_clock_info(): could not access own maps\n");
        else
        {
            size_t nread = fread(mmaps, 1, sizeof(mmaps)-1, mmapsfile);
            fclose(mmapsfile);
            if(nread > 0)
            {
                for (char* line = mmaps; line != NULL;) {
                    char* next_line = strchr(line, '\n');
                    if (next_line != NULL)
                        *(next_line++) = 0;
                    if (strstr(line, "[vvar]")) {
                        vvar_addr = (uint8_t*) (void*) strtoull(line, NULL, 16);
                        break;
                    }
                    line = next_line;
                }
            }
            if(vvar_addr == NULL)
                fprintf(stderr, "prepare_clock_info(): could not find [vvar] mapping\n");
        }
    }
    if(vvar_addr) {
        vdso_data_t *vdso_data = (vdso_data_t *)(vvar_addr + 128);
        //get the vdso data and calculate the frequency of the TSC - we use the CS_RAW vdso_data
        // because the mult in CS_HRES_COARSE gets periodically updated by the kernel - we don't want that.
        //The kernel calculates ns from cycles using: ns = (cycles*mult) >> shift
        // so we just set ns = 1000000000 (1 second) and factor out cycles.
        uint64_t nominal_freq = (uint64_t) (((__uint128_t)1000000000ull << vdso_data[CS_RAW].shift) / vdso_data[CS_RAW].mult);
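        //Worked example with hypothetical numbers (not taken from any real machine):
        // with shift = 24 and mult = 5592405 this gives
        // (1000000000 << 24)/5592405 ~= 3000000178, i.e. a roughly 3 GHz TSC; after
        // the rounding step below it becomes exactly 3000000000.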
        //round to something reasonable to get rid of the inaccuracy from the mult-shift -> freq conversion
        uint64_t rounding = 2000;
        nominal_freq = (nominal_freq + rounding/2)/rounding*rounding;
        //calculate a Windows-style mult constant for a 64 bit shift.
        //We will calculate ns from cycles using: ns = (cycles*mult) >> 64
        // which is fast because cpus can directly get the high part of a 64 bit multiplication.
        //We essentially treat mult as a 64.64 bit fixed point number.
        uint64_t mult_shift_64 = 0;
        {
            //see: https://elixir.bootlin.com/barebox/v2023.12.0/source/common/clock.c#L138
            uint64_t time_period = 1000000000;
            __uint128_t temp = ((__uint128_t)time_period << 64) + nominal_freq/2;
            mult_shift_64 = (uint64_t) (temp / nominal_freq);
        }
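        //Continuing the hypothetical 3 GHz example from above: mult_shift_64 comes out as
        // ((1000000000 << 64) + 1500000000)/3000000000 = 6148914691236517205 ~= 2^64/3,
        // so later (cycles * mult_shift_64) >> 64 ~= cycles/3 - exactly what converting
        // cycles at 3 GHz to ns should do.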
        global_clock_info.data = vdso_data;
        global_clock_info.offset_ptr = &vdso_data[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE];
        global_clock_info.coarse = &vdso_data[CS_HRES_COARSE];
        global_clock_info.mult = mult_shift_64;
        global_clock_info.nominal_freq = nominal_freq;
        global_clock_info.init_state = 1;
    }
}
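//Note: vdso_data_t also has a clock_mode field which this code never checks. The
// kernel's own vDSO reader consults it to make sure the TSC path is actually usable
// (i.e. that the current clocksource really is the TSC); checking it here as well
// would be a sensible hardening step.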
uint64_t custom_clock_realtime_ns()
{
    //despite this loop looking scary the raw __rdtsc() call takes around
    // 90% of the cpu time here. You can try benchmarking this function against
    // clock_gettime(CLOCK_REALTIME, &t) and raw __rdtsc() calls.
    //see: https://elixir.bootlin.com/linux/v5.9/source/lib/vdso/gettimeofday.c#L107
    //init and handle errors
    if(global_clock_info.init_state == 0)
    {
        prepare_clock_info();
        if(global_clock_info.init_state == 0)
            return 0;
    }
    //do the time reading
    //The kernel periodically goes around and updates all the relevant clock data:
    // namely it sets the actual real time as measured by some external hardware
    // and records the TSC at that instant.
    //We can then read this real time and use it, only adjusting it by the elapsed
    // time between the current TSC and the TSC recorded by the kernel.
    //Also whenever the kernel updates this data it increments the seq variable.
    //We can use that to see if something has changed before we are done reading everything.
    // If it has we simply try again.
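    //Note: the kernel's own reader (see the gettimeofday.c link above) additionally
    // treats an odd seq as "update in progress" and spins until it turns even; this
    // simplified version only retries when seq changed between the two reads.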
    uint32_t seq = global_clock_info.coarse->seq;
    for(;;) {
        atomic_thread_fence(memory_order_acquire);
        uint64_t now = __rdtsc();
        uint64_t offset_s = global_clock_info.offset_ptr->sec;
        uint64_t offset_ns = global_clock_info.offset_ptr->nsec;
        uint64_t last_clock = global_clock_info.coarse->cycle_last;
        atomic_thread_fence(memory_order_release);
        uint32_t new_seq = global_clock_info.coarse->seq;
        //if something has changed go again.
        // If not do the actual computation.
        if(new_seq == seq)
        {
            uint64_t diff = 0;
            if(now > last_clock)
                diff = now - last_clock;
            uint64_t diff_ns = (uint64_t) (((__uint128_t)diff * global_clock_info.mult) >> 64);
            uint64_t realtime_offset_ns = offset_s*1000000000ull + offset_ns;
            return realtime_offset_ns + diff_ns;
        }
        seq = new_seq;
    }
}
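//A minimal benchmark sketch (an addition, not part of the original gist) in the
// spirit of the comment at the top of custom_clock_realtime_ns(): it times N
// back-to-back calls of a uint64_t-returning clock function. The iteration count
// and the sink-based defense against dead-code elimination are assumptions.
uint64_t bench_clock_ns_per_call(uint64_t (*clock_fn)(void), int iters)
{
    struct timespec t0, t1;
    uint64_t sink = 0;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for(int i = 0; i < iters; i++)
        sink += clock_fn();
    clock_gettime(CLOCK_MONOTONIC, &t1);
    //keep sink observable so the compiler cannot delete the loop
    if(sink == 1) fprintf(stderr, "unlikely\n");
    int64_t elapsed = (int64_t)(t1.tv_sec - t0.tv_sec)*1000000000ll + (t1.tv_nsec - t0.tv_nsec);
    return (uint64_t)elapsed/(uint64_t)iters;
}
//usage (hypothetical): printf("%llu ns/call\n", (unsigned long long) bench_clock_ns_per_call(custom_clock_realtime_ns, 1000000));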
uint64_t custom_clock_realtime_ns_with_kernel_adjustements()
{
    //This function is just like the previous one except we take advantage of the
    // kernel's TSC frequency scaling. As stated above the TSC drifts a bit and becomes
    // inaccurate after some time. We fix that by periodically taking the real time
    // into account.
    //However the kernel also tries to guess the "real" frequency of the TSC, by
    // simply comparing the current value of the TSC and the current time and determining
    // what the frequency should be so that the two match up. For the kernel side see:
    // https://github.com/torvalds/linux/blob/d3d90cc2891c9cf4ecba7b85c0af716ab755c7e5/kernel/time/timekeeping.c#L1929
    //Luckily all we need to do is use the updated values which are in the CS_HRES_COARSE clock source.
    if(global_clock_info.init_state == 0)
    {
        prepare_clock_info();
        if(global_clock_info.init_state == 0)
            return 0;
    }
    uint32_t seq = global_clock_info.coarse->seq;
    for(;;) {
        atomic_thread_fence(memory_order_acquire);
        uint64_t now = __rdtsc();
        uint64_t offset_s = global_clock_info.offset_ptr->sec;
        uint64_t offset_ns = global_clock_info.offset_ptr->nsec;
        uint64_t mult = global_clock_info.data[CS_HRES_COARSE].mult;
        uint64_t shift = global_clock_info.data[CS_HRES_COARSE].shift;
        uint64_t last_clock = global_clock_info.coarse->cycle_last;
        atomic_thread_fence(memory_order_release);
        uint32_t new_seq = global_clock_info.coarse->seq;
        if(new_seq == seq)
        {
            uint64_t diff = 0;
            if(now > last_clock)
                diff = now - last_clock;
            uint64_t diff_ns = (diff*mult) >> shift;
            uint64_t realtime_offset_ns = offset_s*1000000000ull + offset_ns;
            return realtime_offset_ns + diff_ns;
        }
        seq = new_seq;
    }
}
#include <unistd.h>
int main()
{
    for(;;) {
        uint64_t realtime_custom = 0;
        uint64_t realtime_os = 0;
        struct timespec t;
        clock_gettime(CLOCK_REALTIME, &t);
        realtime_custom = custom_clock_realtime_ns();
        realtime_os = (uint64_t)t.tv_sec*1000000000ull + t.tv_nsec;
        printf("os: %llu\n", (unsigned long long) realtime_os);
        printf("custom: %llu\n", (unsigned long long) realtime_custom);
        printf("diff: %lli\n", (long long) (realtime_os - realtime_custom));
        clock_gettime(CLOCK_REALTIME, &t);
        realtime_custom = custom_clock_realtime_ns_with_kernel_adjustements();
        realtime_os = (uint64_t)t.tv_sec*1000000000ull + t.tv_nsec;
        printf("os: %llu\n", (unsigned long long) realtime_os);
        printf("custom_adj: %llu\n", (unsigned long long) realtime_custom);
        printf("diff: %lli\n", (long long) (realtime_os - realtime_custom));
        usleep(500000);
    }
}
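//(Assumed build/run line, not stated in the gist: cc -O2 thisfile.c -o clock && ./clock
// on x86-64 Linux. The [vvar] layout is kernel-version dependent - see the comment below
// about CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT shifting the field offsets.)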
@mmozeiko
mmozeiko commented Jan 31, 2025

That does not handle the CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT build option - it shifts the mult/shift field offsets, which is what happens on my newer Arch kernel machine.
