Skip to content

Instantly share code, notes, and snippets.

@davedice
Created July 18, 2016 15:52
Show Gist options
  • Save davedice/4ca7aa6769abcfd04d5c866fc930b9f7 to your computer and use it in GitHub Desktop.
Save davedice/4ca7aa6769abcfd04d5c866fc930b9f7 to your computer and use it in GitHub Desktop.
// Build: g++ -O3 -std=gnu++14 -m64 MonoTime.cc -lpthread -o MonoTime -mmemory-model=tso
//
// Dave Dice -- blogs.oracle.com/dave
#include <thread>
#include <chrono>
#include <iostream>
#include <vector>
#include <mutex>
#include <random>
#include <atomic>
#include <alloca.h>
#include <cxxabi.h>
#include <sys/time.h>
template<typename Fun>
inline auto operator+(std::mutex & L, Fun&& fn) -> decltype(fn()) {
std::lock_guard<std::mutex> Locker(L) ;
return fn() ;
}
// Ideally we'd use alignas(128), but that isn't handled gracefully by all compilers
#define CALIGNED __attribute__ ((aligned(128)))
struct _Aligner {
CALIGNED int a [0] ;
} ;
template<typename T> using Ref = T * ;
template<typename T> using AlignedType CALIGNED = T ;
using Av32t = CALIGNED volatile int32_t ;
static const auto LMax = [](auto x, auto y) { return (x >= y) ? x : y ; } ;
static const auto LMin = [](auto x, auto y) { return (x <= y) ? x : y ; } ;
template<typename T>
T CAS(std::atomic<T> * p, T Comparand, T Set) {
T cpy = Comparand ;
std::atomic_compare_exchange_strong (p, &cpy, Set) ;
return cpy ;
}
template<typename T>
T CASV(std::atomic<T> * p, T Comparand, T Set) {
std::atomic_compare_exchange_strong (p, &Comparand, Set) ;
return Comparand ;
}
static const auto LCAS = [](auto ARef, auto Cmp, auto Set) {
std::atomic_compare_exchange_strong (ARef, &Cmp, Set) ;
return Cmp ;
} ;
static void FullFence() {
std::atomic_thread_fence (std::memory_order_seq_cst) ;
}
static void CompilerFence() {
// Sequence point
std::atomic_thread_fence (std::memory_order_relaxed) ;
}
// MonoTime() : provides a non-retrograde causal clock
// Models HotSpot JVM nanoTime() implementation
//
// Remarks:
// * Note that the fetch of TMax is SC by default : memory_order_seq_cst.
// A weak load might suffice as the value is validated via CAS
// and CAS has strong fence semantics.
// * We use an anonymous union to sequester and isolate TMax as sole occupant
// of its cache line.
// * MonoTime() uses an abstract base, so adding a small episilon is benign
// That is, the implement has latitute to add "Grain".
// MonoTime() is uncorrelated with other clocks - not comparable
// It should be used only for relative time, not wall-clock absolute time.
// * Note that the grain of the underlying gethrtime() clock source
// already influences the peak update rate.
// * A useful microoptimization is to LD Grain before the LD-CAS window
// * Being able to set Grain gives us a useful test to see if an application
// running on a system is sensitive to the nanoTime coherence hotspot.
// * Requires only a one-line changeset relative to baseline : add Grain
// * Assumes the skew-drift is extremely small
// * This example uses gethrtime() on Solaris
// We could also use "RD STICK" directly.
// Substitute your favor clock source accordingly.
// On modern Linux/x86 systems clock_gettime (CLOCK_MONOTONIC...) is a good choice,
// assuming it's implemented via VDSO.
// You might also use RDTSCP directly.
// * MonoTime() is expected to be causal in the following sense.
// Say thread A calls MonoTime() which returns value V.
// A stores V into some memory location.
// Thread B fetches from that location and observes the store, fetching V.
// B then call MonoTime() which returns W
// It should be the case that W >= V.
static hrtime_t Grain = 0 ;
static hrtime_t MonoTime() {
static union { std::atomic<hrtime_t> CALIGNED TMax {0} ; _Aligner Ax; } ;
hrtime_t now = gethrtime() ;
const hrtime_t tmax = TMax ;
if (tmax >= now) return tmax ;
// Quantize "now" value up to reduce subsequent update rate
// Attempt to reduce write rate into TMax to avoid coherence hotspot
// Grain value reflects trade-off between update rate and granularity of MonoTime() values
now += Grain ;
const hrtime_t v = LCAS (&TMax, tmax, now) ;
// Note that we can trivially make the following branch-free, but it's less readable.
if (v == tmax) return now ;
// claim : v > tmax
return v ;
}
int main (int argc, char * argv []) {
setbuf (stdout, NULL) ;
auto nthreads = atoi(argv[1]);
Grain = atoi (argv[2]) ;
printf ("%d threads on %d CPUs; Grain=%lld\n",
nthreads,
std::thread::hardware_concurrency(),
Grain) ;
double Duration = 10.0;
// beware of false sharing ...
std::mutex CALIGNED mut {};
int64_t CALIGNED tally = 0 ;
volatile bool CALIGNED done = false;
std::vector<std::thread> threads;
for (int i = 0; i < nthreads; i++) {
threads.emplace_back(std::thread([&](){
int64_t steps = 0 ;
hrtime_t sum = 0 ;
hrtime_t prv = MonoTime() ;
while (!done) {
++steps ;
const auto v = MonoTime() ;
if (v < prv) printf ("Error %llX-%llX %lld ", v, prv, prv-v) ;
prv = v ;
sum += v ;
}
if (sum == 0x11) printf ("!") ;
mut + [&] { tally += steps; } ;
printf ("%d ", steps) ;
}));
}
std::this_thread::sleep_for(std::chrono::duration<double>(Duration));
done = true;
for (auto &t : threads) {
t.join();
}
printf ("\n") ;
std::cout << tally << " counted in " << Duration << " seconds (" << (tally/Duration) << " per sec)\n";
printf ("\n") ;
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment