Created
July 18, 2016 15:52
-
-
Save davedice/4ca7aa6769abcfd04d5c866fc930b9f7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Build: g++ -O3 -std=gnu++14 -m64 MonoTime.cc -lpthread -o MonoTime -mmemory-model=tso | |
// | |
// Dave Dice -- blogs.oracle.com/dave | |
#include <thread> | |
#include <chrono> | |
#include <iostream> | |
#include <vector> | |
#include <mutex> | |
#include <random> | |
#include <atomic> | |
#include <alloca.h> | |
#include <cxxabi.h> | |
#include <sys/time.h> | |
template<typename Fun> | |
inline auto operator+(std::mutex & L, Fun&& fn) -> decltype(fn()) { | |
std::lock_guard<std::mutex> Locker(L) ; | |
return fn() ; | |
} | |
// Ideally we'd use alignas(128), but that isn't handled gracefully by all compilers | |
#define CALIGNED __attribute__ ((aligned(128))) | |
struct _Aligner { | |
CALIGNED int a [0] ; | |
} ; | |
template<typename T> using Ref = T * ; | |
template<typename T> using AlignedType CALIGNED = T ; | |
using Av32t = CALIGNED volatile int32_t ; | |
static const auto LMax = [](auto x, auto y) { return (x >= y) ? x : y ; } ; | |
static const auto LMin = [](auto x, auto y) { return (x <= y) ? x : y ; } ; | |
template<typename T> | |
T CAS(std::atomic<T> * p, T Comparand, T Set) { | |
T cpy = Comparand ; | |
std::atomic_compare_exchange_strong (p, &cpy, Set) ; | |
return cpy ; | |
} | |
template<typename T> | |
T CASV(std::atomic<T> * p, T Comparand, T Set) { | |
std::atomic_compare_exchange_strong (p, &Comparand, Set) ; | |
return Comparand ; | |
} | |
static const auto LCAS = [](auto ARef, auto Cmp, auto Set) { | |
std::atomic_compare_exchange_strong (ARef, &Cmp, Set) ; | |
return Cmp ; | |
} ; | |
static void FullFence() { | |
std::atomic_thread_fence (std::memory_order_seq_cst) ; | |
} | |
static void CompilerFence() { | |
// Sequence point | |
std::atomic_thread_fence (std::memory_order_relaxed) ; | |
} | |
// MonoTime() : provides a non-retrograde causal clock | |
// Models HotSpot JVM nanoTime() implementation | |
// | |
// Remarks: | |
// * Note that the fetch of TMax is SC by default : memory_order_seq_cst. | |
// A weak load might suffice as the value is validated via CAS | |
// and CAS has strong fence semantics. | |
// * We use an anonymous union to sequester and isolate TMax as sole occupant | |
// of its cache line. | |
// * MonoTime() uses an abstract base, so adding a small episilon is benign | |
// That is, the implement has latitute to add "Grain". | |
// MonoTime() is uncorrelated with other clocks - not comparable | |
// It should be used only for relative time, not wall-clock absolute time. | |
// * Note that the grain of the underlying gethrtime() clock source | |
// already influences the peak update rate. | |
// * A useful microoptimization is to LD Grain before the LD-CAS window | |
// * Being able to set Grain gives us a useful test to see if an application | |
// running on a system is sensitive to the nanoTime coherence hotspot. | |
// * Requires only a one-line changeset relative to baseline : add Grain | |
// * Assumes the skew-drift is extremely small | |
// * This example uses gethrtime() on Solaris | |
// We could also use "RD STICK" directly. | |
// Substitute your favor clock source accordingly. | |
// On modern Linux/x86 systems clock_gettime (CLOCK_MONOTONIC...) is a good choice, | |
// assuming it's implemented via VDSO. | |
// You might also use RDTSCP directly. | |
// * MonoTime() is expected to be causal in the following sense. | |
// Say thread A calls MonoTime() which returns value V. | |
// A stores V into some memory location. | |
// Thread B fetches from that location and observes the store, fetching V. | |
// B then call MonoTime() which returns W | |
// It should be the case that W >= V. | |
static hrtime_t Grain = 0 ; | |
static hrtime_t MonoTime() { | |
static union { std::atomic<hrtime_t> CALIGNED TMax {0} ; _Aligner Ax; } ; | |
hrtime_t now = gethrtime() ; | |
const hrtime_t tmax = TMax ; | |
if (tmax >= now) return tmax ; | |
// Quantize "now" value up to reduce subsequent update rate | |
// Attempt to reduce write rate into TMax to avoid coherence hotspot | |
// Grain value reflects trade-off between update rate and granularity of MonoTime() values | |
now += Grain ; | |
const hrtime_t v = LCAS (&TMax, tmax, now) ; | |
// Note that we can trivially make the following branch-free, but it's less readable. | |
if (v == tmax) return now ; | |
// claim : v > tmax | |
return v ; | |
} | |
int main (int argc, char * argv []) { | |
setbuf (stdout, NULL) ; | |
auto nthreads = atoi(argv[1]); | |
Grain = atoi (argv[2]) ; | |
printf ("%d threads on %d CPUs; Grain=%lld\n", | |
nthreads, | |
std::thread::hardware_concurrency(), | |
Grain) ; | |
double Duration = 10.0; | |
// beware of false sharing ... | |
std::mutex CALIGNED mut {}; | |
int64_t CALIGNED tally = 0 ; | |
volatile bool CALIGNED done = false; | |
std::vector<std::thread> threads; | |
for (int i = 0; i < nthreads; i++) { | |
threads.emplace_back(std::thread([&](){ | |
int64_t steps = 0 ; | |
hrtime_t sum = 0 ; | |
hrtime_t prv = MonoTime() ; | |
while (!done) { | |
++steps ; | |
const auto v = MonoTime() ; | |
if (v < prv) printf ("Error %llX-%llX %lld ", v, prv, prv-v) ; | |
prv = v ; | |
sum += v ; | |
} | |
if (sum == 0x11) printf ("!") ; | |
mut + [&] { tally += steps; } ; | |
printf ("%d ", steps) ; | |
})); | |
} | |
std::this_thread::sleep_for(std::chrono::duration<double>(Duration)); | |
done = true; | |
for (auto &t : threads) { | |
t.join(); | |
} | |
printf ("\n") ; | |
std::cout << tally << " counted in " << Duration << " seconds (" << (tally/Duration) << " per sec)\n"; | |
printf ("\n") ; | |
return 0; | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment