Last active
September 30, 2024 11:48
-
-
Save pmttavara/6f06fc5c7679c07375483b06bb77430c to your computer and use it in GitHub Desktop.
Obtain RDTSC frequency on Win32 and Linux
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// SPDX-FileCopyrightText: © 2022 Phillip Trudeau-Tavara <[email protected]> | |
// SPDX-License-Identifier: 0BSD | |
// https://hero.handmade.network/forums/code-discussion/t/7485-queryperformancefrequency_returning_10mhz_bug/2 | |
// https://learn.microsoft.com/en-us/virtualization/hyper-v-on-windows/tlfs/timers#partition-reference-tsc-mechanism | |
#include <stdbool.h> | |
#include <stdint.h> | |
#define WIN32_LEAN_AND_MEAN | |
#define VC_EXTRALEAN | |
#define NOMINMAX | |
#include <Windows.h> | |
#include <intrin.h> | |
// Returns the RDTSC frequency in ticks per second (Win32).
//
// The value is computed on the first call and cached in a function-local
// static, so later calls return immediately without redoing the work.
//
// Strategy, in order:
//   1. Fast path: ask NtQuerySystemInformation for the hypervisor shared page
//      and derive the frequency from the scale value stored in it.
//   2. Slow path: time a short Sleep() with both RDTSC and
//      QueryPerformanceCounter, then extrapolate to one second.
//   3. If neither produced a non-zero value, return 1000000000.
static inline uint64_t get_rdtsc_freq(void) {
    // NtQuerySystemInformation is exported with the NTAPI (__stdcall) calling
    // convention. The pointer type has to say so, otherwise a 32-bit x86 build
    // calls it as __cdecl and unbalances the stack.
    typedef LONG (NTAPI *nt_query_system_information_fn)(int, void *, ULONG, ULONG *);

    // Cache the answer so that multiple calls never take the slow path more than once
    static uint64_t tsc_freq = 0;
    if (tsc_freq) {
        return tsc_freq;
    }

    // Fast path: Load kernel-mapped memory page
    HMODULE ntdll = LoadLibraryA("ntdll.dll");
    if (ntdll) {
        nt_query_system_information_fn query_system_information =
            (nt_query_system_information_fn)GetProcAddress(ntdll, "NtQuerySystemInformation");
        if (query_system_information) {
            volatile uint64_t *hypervisor_shared_page = NULL;
            ULONG size = 0;
            // SystemHypervisorSharedPageInformation == 0xc5
            LONG result = query_system_information(0xc5, (void *)&hypervisor_shared_page,
                                                   sizeof(hypervisor_shared_page), &size);
            // Success means: non-negative status, a pointer-sized answer, and a
            // pointer that is actually usable.
            if (result >= 0 && size == sizeof(hypervisor_shared_page) && hypervisor_shared_page) {
                // docs say ReferenceTime = ((VirtualTsc * TscScale) >> 64)
                // set ReferenceTime = 10000000 = 1 second @ 10MHz, solve for VirtualTsc
                // => VirtualTsc = 10000000 / (TscScale >> 64)
                // Only the top 32 bits of the scale are used so the division
                // fits in 64-bit arithmetic; this trades some precision for
                // not needing a 128-bit type.
                uint64_t scale_hi = hypervisor_shared_page[1] >> 32;
                if (scale_hi) { // a zero divisor means "no usable data": use the slow path
                    tsc_freq = (10000000ull << 32) / scale_hi;
                }
            }
        }
        FreeLibrary(ntdll);
    }

    // Slow path
    if (!tsc_freq) {
        LARGE_INTEGER qpc_freq;
        LARGE_INTEGER qpc_begin;
        LARGE_INTEGER qpc_end;
        qpc_freq.QuadPart = 0;
        qpc_begin.QuadPart = 0;
        qpc_end.QuadPart = 0;

        if (QueryPerformanceFrequency(&qpc_freq) && qpc_freq.QuadPart > 0 &&
            QueryPerformanceCounter(&qpc_begin)) {
            // Get time before sleep
            uint64_t tsc_begin = __rdtsc();
            Sleep(2);
            // Get time after sleep
            if (QueryPerformanceCounter(&qpc_end)) {
                uint64_t tsc_end = __rdtsc();
                LONGLONG qpc_elapsed = qpc_end.QuadPart - qpc_begin.QuadPart;
                // Do the math to extrapolate the RDTSC ticks elapsed in 1 second.
                // A non-positive interval would divide by zero or yield garbage,
                // so leave tsc_freq at 0 and let the failure case handle it.
                if (qpc_elapsed > 0) {
                    tsc_freq = (tsc_end - tsc_begin) * (uint64_t)qpc_freq.QuadPart / (uint64_t)qpc_elapsed;
                }
            }
        }
    }

    // Failure case
    if (!tsc_freq) {
        tsc_freq = 1000000000;
    }
    return tsc_freq;
}
#include <stdio.h> | |
// Demo driver: calls get_rdtsc_freq() 1,000 times, then prints how long the
// calls took in total and the frequency that was reported.
int main(void) {
    printf("Calling get_rdtsc_freq() 1,000 times...\n");
    uint64_t start = __rdtsc();
    uint64_t freq = 0;
    for (int i = 0; i < 1000; i++) {
        freq = get_rdtsc_freq();
    }
    uint64_t end = __rdtsc();
    printf("...took %f milliseconds.\n", (end - start) * 1000.0 / freq);
    // unsigned long is only 32 bits wide on Windows, so print the 64-bit
    // frequency as unsigned long long to avoid truncating values above ~4.29 GHz.
    printf("RDTSC frequency is %llu (%.2f GHz).\n", (unsigned long long)freq, freq * 0.000000001);
    return 0;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// SPDX-FileCopyrightText: © 2022 Phillip Trudeau-Tavara <[email protected]> | |
// SPDX-License-Identifier: 0BSD | |
// https://linux.die.net/man/2/perf_event_open | |
// https://stackoverflow.com/a/57835630 | |
#include <stdbool.h>
#include <stdint.h>
#include <time.h>
#include <linux/perf_event.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <x86intrin.h>
// Returns the RDTSC frequency in ticks per second (Linux).
//
// The value is computed on the first call and cached in a function-local
// static, so later calls return immediately without redoing the work.
//
// Strategy, in order:
//   1. Fast path: open a perf event, map its metadata page, and derive the
//      frequency from the time_mult / time_shift conversion factors.
//   2. Slow path: time a 10 ms usleep() with both RDTSC and
//      CLOCK_MONOTONIC_RAW, then extrapolate to one second.
//   3. If neither produced a non-zero value, return 1000000000.
static inline uint64_t get_rdtsc_freq(void) {
    // Cache the answer so that multiple calls never take the slow path more than once
    static uint64_t tsc_freq = 0;
    if (tsc_freq) {
        return tsc_freq;
    }

    // Fast path: Load kernel-mapped memory page
    struct perf_event_attr pe = {0};
    pe.type = PERF_TYPE_HARDWARE;
    pe.size = sizeof(struct perf_event_attr);
    pe.config = PERF_COUNT_HW_INSTRUCTIONS;
    pe.disabled = 1;
    pe.exclude_kernel = 1;
    pe.exclude_hv = 1;
    // Use the symbolic syscall number: the raw value differs between ABIs.
    int fd = (int)syscall(SYS_perf_event_open, &pe, 0, -1, -1, 0);
    if (fd != -1) {
        // mmap reports failure with MAP_FAILED, not NULL.
        void *page = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
        if (page != MAP_FAILED) {
            const struct perf_event_mmap_page *pc = (const struct perf_event_mmap_page *)page;
            // success
            if (pc->cap_user_time == 1) {
                // docs say nanoseconds = (tsc * time_mult) >> time_shift
                // set nanoseconds = 1000000000 = 1 second in nanoseconds, solve for tsc
                // => tsc = (1000000000 << time_shift) / time_mult
                //
                // 1000000000 is below 2^30, so it can be shifted left by up to
                // 34 bits without leaving 64 bits. Put as much of the shift as
                // fits on the numerator (exact when time_shift <= 34) and apply
                // only the remainder to the divisor.
                uint64_t mult = pc->time_mult;
                unsigned int shift = pc->time_shift;
                unsigned int numerator_shift = shift <= 34 ? shift : 34;
                unsigned int divisor_shift = shift - numerator_shift;
                // time_mult is 32 bits wide: shifting it right by 32 or more
                // leaves nothing, so treat that as "no usable data".
                if (divisor_shift < 32) {
                    uint64_t divisor = mult >> divisor_shift;
                    if (divisor) {
                        tsc_freq = (1000000000ull << numerator_shift) / divisor;
                    }
                }
            }
            munmap(page, 4096);
        }
        close(fd);
    }

    // Slow path
    if (!tsc_freq) {
        struct timespec before;
        struct timespec after;
        // Get time before sleep
        if (clock_gettime(CLOCK_MONOTONIC_RAW, &before) == 0) {
            uint64_t tsc_begin = __rdtsc();
            usleep(10000); // 10ms gives ~4.5 digits of precision - the longer you sleep, the more precise you get
            // Get time after sleep
            if (clock_gettime(CLOCK_MONOTONIC_RAW, &after) == 0) {
                uint64_t tsc_end = __rdtsc();
                uint64_t nsc_begin = (uint64_t)before.tv_sec * 1000000000ull + (uint64_t)before.tv_nsec;
                uint64_t nsc_end = (uint64_t)after.tv_sec * 1000000000ull + (uint64_t)after.tv_nsec;
                // Do the math to extrapolate the RDTSC ticks elapsed in 1 second.
                // Skip it when no time elapsed, to avoid dividing by zero; the
                // failure case below then supplies the result.
                if (nsc_end > nsc_begin) {
                    tsc_freq = (tsc_end - tsc_begin) * 1000000000ull / (nsc_end - nsc_begin);
                }
            }
        }
    }

    // Failure case
    if (!tsc_freq) {
        tsc_freq = 1000000000;
    }
    return tsc_freq;
}
#include <stdio.h> | |
// Demo driver: queries the RDTSC frequency 1,000 times, then reports the total
// time those calls took and the frequency that came back.
int main() {
    printf("Calling get_rdtsc_freq() 1,000 times...\n");

    const uint64_t first_tick = __rdtsc();
    uint64_t ticks_per_second = 0;
    int remaining = 1000;
    while (remaining-- > 0) {
        ticks_per_second = get_rdtsc_freq();
    }
    const uint64_t last_tick = __rdtsc();

    // Elapsed ticks scaled to milliseconds using the reported frequency.
    const double elapsed_ms = (last_tick - first_tick) * 1000.0 / ticks_per_second;
    const double gigahertz = ticks_per_second * 0.000000001;

    printf("...took %f milliseconds.\n", elapsed_ms);
    printf("RDTSC frequency is %lu (%.2f GHz).\n", (unsigned long)ticks_per_second, gigahertz);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Awesome. This literally dropped right as I was about to look more into this stuff for my own profiler. It was meant to be