Last active
July 12, 2024 12:18
-
-
Save azat/7ea7f50ed75591b1af2d675a240ea94c to your computer and use it in GitHub Desktop.
Test CLOCK_MONOTONIC/CLOCK_MONOTONIC_RAW/rdtsc monotonicity
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <atomic> | |
#include <bits/types/clockid_t.h> | |
#include <cstdint> | |
#include <cstdlib> | |
#include <ctime> | |
#include <cerrno> | |
#include <iostream> | |
#include <list> | |
#include <random> | |
#include <sched.h> | |
#include <stdexcept> | |
#include <thread> | |
#include <utility> | |
#define THREADS 1 | |
#define VERBOSE_ERROR (THREADS==1) | |
#define GLOBAL_CLOCK_COUNTER (THREADS>100) | |
#define CLOCK_ITERATIONS 1'000ULL | |
/// Run via unshare -T, since docker does not suppor time ns [1]. | |
/// [1]: https://github.com/moby/moby/issues/39163 | |
#define NTP_THREAD_SYSTEMD 0 | |
#define NTP_THREAD_CHRONYD 1 | |
#define SCHED_YIELD (THREADS==1 || GLOBAL_CLOCK_COUNTER==0) | |
#define NSEC_PER_SEC (1000000000ULL) | |
#define NSEC_PER_MSEC (1000000LL) | |
__inline__ uint64_t rdtsc(void) | |
{ | |
uint32_t lo, hi; | |
__asm__ __volatile__ ( // serialize | |
"xorl %%eax,%%eax \n cpuid" | |
::: "%rax", "%rbx", "%rcx", "%rdx"); | |
/* We cannot use "=A", since this would use %rax on x86_64 and return only the lower 32bits of the TSC */ | |
__asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi)); | |
return (uint64_t)hi << 32 | lo; | |
} | |
timespec _clock_gettime(clockid_t clk) | |
{ | |
timespec ts; | |
if (clock_gettime(clk, &ts)) | |
{ | |
perror("clock_gettime"); | |
throw std::runtime_error("clock_gettime"); | |
} | |
return ts; | |
} | |
uint64_t now_ns(clockid_t clock) | |
{ | |
struct timespec ts = _clock_gettime(clock); | |
return ts.tv_sec * (uint64_t)1e9 + ts.tv_nsec; | |
} | |
template <class F, class ... Args> | |
uint64_t elapsed_ns(clockid_t clock, const F & f, Args && ... args) | |
{ | |
uint64_t start = now_ns(clock); | |
f(std::forward<Args>(args)...); | |
uint64_t end = now_ns(clock); | |
return end - start; | |
} | |
template <class F, class ... Args> | |
auto run(const F & f, Args && ... args) | |
{ | |
std::atomic_bool stopped = false; | |
/// FIXME: split for systemd vs chronyd | |
#if GLOBAL_CLOCK_COUNTER | |
std::atomic<uint64_t> counter = 0; | |
std::atomic<unsigned> cpu = -1; | |
#else | |
thread_local uint64_t counter = 0; | |
thread_local unsigned cpu = -1; | |
#endif | |
std::atomic<uint64_t> errors = 0; | |
#if NTP_THREAD_SYSTEMD | |
std::thread ntp_systemd([&]() { | |
size_t i = 0; | |
while (!stopped.load(std::memory_order_relaxed)) { | |
timex tmx{}; | |
tmx.modes = ADJ_STATUS | ADJ_NANO | ADJ_OFFSET | ADJ_TIMECONST | ADJ_MAXERROR | ADJ_ESTERROR; | |
// tmx.status = STA_PLL | STA_NANO | STA_MODE; | |
tmx.status = STA_PLL | STA_NANO; | |
if (rand() % 2) | |
tmx.offset = -rand()%500 * NSEC_PER_MSEC; | |
else | |
tmx.offset = rand()%500 * NSEC_PER_MSEC; | |
tmx.constant = 7; | |
timespec old_ts = _clock_gettime(CLOCK_REALTIME); | |
uint64_t took_ns = elapsed_ns(CLOCK_MONOTONIC_RAW, [&]() { | |
/// NOTE: clock_adjtime(CLOCK_REALTIME) is the same as adjtimex()/ntp_adjtime() | |
if (clock_adjtime(CLOCK_REALTIME, &tmx)) | |
{ | |
perror("clock_adjtime"); | |
throw std::runtime_error("clock_adjtime"); | |
} | |
}); | |
timespec now_ts = _clock_gettime(CLOCK_REALTIME); | |
if (i++ == 0) | |
fprintf(stderr, | |
" status : %04i %s\n" | |
" time was : %li.%03lli\n" | |
" adjtime time : %li.%03lli\n" | |
" time now : %li.%03lli\n" | |
" adjtime took : %li ns\n" | |
" constant : %li\n" | |
" offset : %+.3f sec\n" | |
" freq offset : %+li (%+li ppm)\n", | |
tmx.status, tmx.status & STA_UNSYNC ? "unsync" : "sync", | |
old_ts.tv_sec, old_ts.tv_nsec / NSEC_PER_SEC, | |
tmx.time.tv_sec, tmx.time.tv_usec / NSEC_PER_MSEC, | |
now_ts.tv_sec, now_ts.tv_nsec / NSEC_PER_SEC, | |
took_ns, | |
tmx.constant, | |
(double)tmx.offset / NSEC_PER_SEC, | |
tmx.freq, tmx.freq / 65536); | |
} | |
}); | |
#endif | |
#if NTP_THREAD_CHRONYD | |
std::default_random_engine generator; | |
std::thread ntp_chronyd([&]() { | |
size_t i = 0; | |
/// Set frequency offset from buf.freq. Since Linux 2.6.26, the supplied value is clamped to the range (-32768000, +32768000) | |
std::normal_distribution<double> freq_distribution(-32768000, +32768000); | |
/* std::normal_distribution<double> tick_distribution(9999, 10001); */ | |
while (!stopped.load(std::memory_order_relaxed)) { | |
timex tmx{}; | |
/* tmx.modes = ADJ_FREQUENCY|ADJ_TICK; */ | |
/* tmx.modes = ADJ_STATUS | ADJ_NANO | ADJ_OFFSET | ADJ_TIMECONST | ADJ_MAXERROR | ADJ_ESTERROR; */ | |
tmx.modes = ADJ_STATUS|ADJ_FREQUENCY; | |
/* if (rand() % 2) */ | |
/* tmx.status = STA_PLL | STA_NANO; */ | |
/* tmx.offset = (rand() % 2 ? -1 : 1)*rand()%500 * NSEC_PER_MSEC; */ | |
tmx.freq = freq_distribution(generator); | |
/* tmx.tick = tick_distribution(generator); */ | |
tmx.constant = 7; | |
timespec old_ts = _clock_gettime(CLOCK_REALTIME); | |
uint64_t took_ns = elapsed_ns(CLOCK_MONOTONIC_RAW, [&]() { | |
/// NOTE: clock_adjtime(CLOCK_REALTIME) is the same as adjtimex()/ntp_adjtime() | |
if (clock_adjtime(CLOCK_REALTIME, &tmx)) | |
{ | |
perror("clock_adjtime"); | |
throw std::runtime_error("clock_adjtime"); | |
} | |
}); | |
timespec now_ts = _clock_gettime(CLOCK_REALTIME); | |
fprintf(stderr, | |
"status: %04i %s, " | |
"time was: %li.%03lli, " | |
"adjtime time: %li.%03lli, " | |
"time now: %li.%03lli, " | |
"adjtime took: %li ns, " | |
"offset: %+.3f sec, " | |
"freq offset: %+li (%+li ppm)\n", | |
tmx.status, tmx.status & STA_UNSYNC ? "unsync" : "sync", | |
old_ts.tv_sec, old_ts.tv_nsec / NSEC_PER_SEC, | |
tmx.time.tv_sec, tmx.time.tv_usec / NSEC_PER_MSEC, | |
now_ts.tv_sec, now_ts.tv_nsec / NSEC_PER_SEC, | |
took_ns, | |
(double)tmx.offset / NSEC_PER_SEC, | |
tmx.freq, tmx.freq / 65536); | |
} | |
}); | |
#endif | |
std::list<std::thread> threads(THREADS); | |
for (auto & thread : threads) | |
{ | |
thread = std::thread([&]() { | |
for (size_t i = 0; i < CLOCK_ITERATIONS; ++i) | |
{ | |
#if SCHED_YIELD | |
if (!(rand()%10000)) | |
sched_yield(); | |
#endif | |
/// NOTE: racy, but should work | |
unsigned old_cpu = cpu; | |
uint64_t old_counter = counter; | |
counter = f(std::forward<Args>(args)...); | |
if (getcpu(&cpu, NULL)) | |
{ | |
perror("getcpu"); | |
break; | |
} | |
if (old_counter > counter) | |
{ | |
#if VERBOSE_ERROR | |
if (errors < 10) | |
fprintf(stderr, "ERROR: old_counter = %lu, new_counter = %lu, old CPU = %u, CPU = %u\n", old_counter, counter, old_cpu, cpu); | |
#endif | |
++errors; | |
} | |
} | |
}); | |
} | |
for (auto & thread : threads) | |
thread.join(); | |
stopped = true; | |
#if NTP_THREAD_SYSTEMD | |
ntp_systemd.join(); | |
#endif | |
#if NTP_THREAD_CHRONYD | |
ntp_chronyd.join(); | |
#endif | |
return errors.load(); | |
} | |
int main() | |
{ | |
std::cout << "THREADS = " << THREADS << "\n"; | |
std::cout << "GLOBAL_CLOCK_COUNTER = " << GLOBAL_CLOCK_COUNTER << "\n"; | |
std::cout << "CLOCK_ITERATIONS = " << CLOCK_ITERATIONS << "\n"; | |
std::cout << "NTP_THREAD_SYSTEMD = " << NTP_THREAD_SYSTEMD << "\n"; | |
std::cout << "NTP_THREAD_CHRONYD = " << NTP_THREAD_CHRONYD << "\n"; | |
std::cout << "SCHED_YIELD = " << SCHED_YIELD << "\n"; | |
uint64_t overall_errors = 0; | |
{ | |
std::cout << "rdtsc: "; | |
auto errors = run(rdtsc); | |
if (errors) | |
std::cout << "errors: " << errors << "\n"; | |
else | |
std::cout << "OK\n"; | |
overall_errors += errors; | |
} | |
{ | |
std::cout << "CLOCK_MONOTONIC: "; | |
auto errors = run(now_ns, CLOCK_MONOTONIC); | |
if (errors) | |
std::cout << "errors: " << errors << "\n"; | |
else | |
std::cout << "OK\n"; | |
overall_errors += errors; | |
} | |
{ | |
std::cout << "CLOCK_MONOTONIC_COARSE: "; | |
auto errors = run(now_ns, CLOCK_MONOTONIC_COARSE); | |
if (errors) | |
std::cout << "errors: " << errors << "\n"; | |
else | |
std::cout << "OK\n"; | |
overall_errors += errors; | |
} | |
{ | |
std::cout << "CLOCK_MONOTONIC_RAW: "; | |
auto errors = run(now_ns, CLOCK_MONOTONIC_RAW); | |
if (errors) | |
std::cout << "errors: " << errors << "\n"; | |
else | |
std::cout << "OK\n"; | |
overall_errors += errors; | |
} | |
return overall_errors > 0 ? EXIT_FAILURE : EXIT_SUCCESS; | |
} |
Author
azat
commented
Jul 11, 2024
One more failure with details:
THREADS = 1
GLOBAL_CLOCK_COUNTER = 0
CLOCK_ITERATIONS = 1000000
NTP_THREAD = 1
SCHED_YIELD = 1
status : 8193 sync
time was : 1720702927.000
adjtime time : 1720702927.847
time now : 1720702927.000
adjtime took : 4773 ns
constant : 7
offset : -0.386 sec
freq offset : +2345712 (+35 ppm)
rdtsc: OK
status : 8193 sync
time was : 1720702929.000
adjtime time : 1720702929.737
time now : 1720702929.000
adjtime took : 4094 ns
constant : 7
offset : +0.260 sec
freq offset : +2341056 (+35 ppm)
CLOCK_MONOTONIC: OK
status : 8193 sync
time was : 1720702929.000
adjtime time : 1720702929.876
time now : 1720702929.000
adjtime took : 3729 ns
constant : 7
offset : +0.485 sec
freq offset : +2341056 (+35 ppm)
CLOCK_MONOTONIC_COARSE: ERROR: old_counter = 8090141164178, new_counter = 8090141164177
ERROR: old_counter = 8090141164177, new_counter = 8090141164165
errors: 2
status : 8193 sync
time was : 1720702929.000
adjtime time : 1720702929.996
time now : 1720702929.000
adjtime took : 4103 ns
constant : 7
offset : +0.167 sec
freq offset : +2610085 (+39 ppm)
CLOCK_MONOTONIC_RAW: OK
xargs: sh: exited with status 255; aborting
real 46m10.366s
user 56m43.443s
sys 26m32.073s
And like I expected it failed even without clock_adjtime (NTP), what I wasn't expecting is equal CPUs, hm (it was a bug in a code)
THREADS = 1
GLOBAL_CLOCK_COUNTER = 0
CLOCK_ITERATIONS = 1000000
NTP_THREAD = 0
SCHED_YIELD = 1
rdtsc: OK
CLOCK_MONOTONIC: OK
CLOCK_MONOTONIC_COARSE: ERROR: old_counter = 10532883087019, new_counter = 10532883087018(old CPU = 4, CPU = 4)
errors: 1
CLOCK_MONOTONIC_RAW: OK
xargs: sh: exited with status 255; aborting
real 22m7.468s
user 21m43.815s
sys 0m1.185s
UPD: even after fixing bug with getcpu
, the CPU is still the same (race?):
THREADS = 1
GLOBAL_CLOCK_COUNTER = 0
CLOCK_ITERATIONS = 1000000
NTP_THREAD = 0
SCHED_YIELD = 1
rdtsc: OK
CLOCK_MONOTONIC: OK
CLOCK_MONOTONIC_COARSE: ERROR: old_counter = 10888448586573, new_counter = 10888448586563 (old CPU = 6, CPU = 6)
errors: 1
CLOCK_MONOTONIC_RAW: OK
xargs: sh: exited with status 255; aborting
real 1m43.967s
user 1m42.161s
sys 0m0.104s
Although I have chrony
running, let's try without it
Without chrono
running I cannot reproduce the issue (already for ~20 hours [1]), so it should be definitely clock adjustment in my case
[1]:
real 1069m27.351s
user 1050m48.394s
sys 0m56.961s
Seems that it has nothing to do with XEN, tried on GCP:
- n2-standard-64
- Linux b1 4.19.0-22-cloud-amd64 #1 SMP Debian 4.19.260-1 (2022-09-29) x86_64 GNU/Linux
- debian 10
- chronyd
- no
chl 'select 1'
in background
CLOCK_MONOTONIC_COARSE: ERROR: old_counter = 1232502913873, new_counter = 1232502913788 (old CPU = 7, CPU = 7)
errors: 1
Finally wrote a client with ADJ_FREQUENCY
(and this is indeed what chronyd
does) that reproduces the problem:
$ sudo unshare -T /tmp/test-clocks
status: 0000 sync, time was: 1720786185.000, adjtime time: 1720786185.000, time now: 1720786185.000, adjtime took: 5230 ns, constant: 7, offset: +0.004 sec, freq offset: -32768000 (-500 ppm)
status: 0000 sync, time was: 1720786185.000, adjtime time: 1720786185.000, time now: 1720786185.000, adjtime took: 2565 ns, constant: 7, offset: +0.004 sec, freq offset: -32768000 (-500 ppm)
ERROR: old_counter = 1273344844000358, new_counter = 1273344843999480, old CPU = 43, CPU = 43
status: 0001 sync, time was: 1720786185.000, adjtime time: 1720786185.000, time now: 1720786185.000, adjtime took: 2304 ns, constant: 7, offset: +0.004 sec, freq offset: -10345187 (-157 ppm)
status: 0001 sync, time was: 1720786185.000, adjtime time: 1720786185.000, time now: 1720786185.000, adjtime took: 2294 ns, constant: 7, offset: +0.004 sec, freq offset: -32768000 (-500 ppm)
status: 0001 sync, time was: 1720786185.000, adjtime time: 1720786185.000, time now: 1720786185.000, adjtime took: 2264 ns, constant: 7, offset: +0.004 sec, freq offset: -31677825 (-483 ppm)
ERROR: old_counter = 1273344844000324, new_counter = 1273344843999393, old CPU = 43, CPU = 43
status: 0001 sync, time was: 1720786185.000, adjtime time: 1720786185.000, time now: 1720786185.000, adjtime took: 2525 ns, constant: 7, offset: +0.004 sec, freq offset: -8361227 (-127 ppm)
status: 0000 sync, time was: 1720786185.000, adjtime time: 1720786185.000, time now: 1720786185.000, adjtime took: 2455 ns, constant: 7, offset: +0.004 sec, freq offset: -31666794 (-483 ppm)
status: 0000 sync, time was: 1720786185.000, adjtime time: 1720786185.000, time now: 1720786185.000, adjtime took: 2024 ns, constant: 7, offset: +0.004 sec, freq offset: -32768000 (-500 ppm)
status: 0001 sync, time was: 1720786185.000, adjtime time: 1720786185.000, time now: 1720786185.000, adjtime took: 3286 ns, constant: 7, offset: +0.004 sec, freq offset: -17611749 (-268 ppm)
CLOCK_MONOTONIC_COARSE: errors: 2
status: 0001 sync, time was: 1720786185.000, adjtime time: 1720786185.000, time now: 1720786185.000, adjtime took: 1743 ns, constant: 7, offset: +0.004 sec, freq offset: -32768000 (-500 ppm)
And this results from bare metal
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment