davedice · July 18, 2016 15:52
diff --git a/MonoTime.cc b/MonoTime.cc
 // Build: g++ -O3 -std=gnu++14 -m64 MonoTime.cc -lpthread -o MonoTime -mmemory-model=tso
 //
 // Dave Dice -- blogs.oracle.com/dave

 #include <thread>
 #include <chrono>
 #include <iostream>
 #include <vector>
 #include <mutex>
 #include <random>
 #include <atomic>
 #include <alloca.h>
 #include <cxxabi.h>

 #include <sys/time.h>

 // Runs `fn` while holding mutex `L` and hands back whatever `fn` returns,
 // which enables the shorthand `mutex + [&] { ... }` for a critical section.
 // The lock is dropped when the guard leaves scope, including when `fn`
 // exits by throwing.
 template<typename Fun>
 inline auto operator+(std::mutex & L, Fun&& fn) -> decltype(fn()) {
   const std::lock_guard<std::mutex> guard {L} ;
   return fn() ;
 }

 // Ideally we'd use alignas(128), but that isn't handled gracefully by all compilers
 // CALIGNED expands to a GCC-style attribute requesting 128-byte alignment for
 // the declaration it is attached to.
 #define CALIGNED __attribute__ ((aligned(128)))

 // Alignment-only helper: its sole member is a zero-length int array carrying
 // the CALIGNED attribute (zero-length arrays are a GNU extension).
 // MonoTime() places an _Aligner in the same anonymous union as TMax.
 // NOTE(review): identifiers starting with an underscore followed by an
 // uppercase letter are reserved for the implementation; consider renaming.
 struct _Aligner {
  CALIGNED int a [0] ;
 } ; 

 // Ref<T> is an alias for T*.
 template<typename T> using Ref = T * ; 
 // AlignedType<T> is an alias for T declared with the CALIGNED attribute.
 // NOTE(review): whether the attribute takes effect through an alias template
 // is compiler-dependent -- confirm before relying on it.
 template<typename T> using AlignedType CALIGNED = T ; 
 // Av32t is a volatile int32_t declared with the CALIGNED attribute.
 using Av32t = CALIGNED volatile int32_t ; 

 // Generic maximum: yields the first argument when it compares >= the second,
 // otherwise the second.  Arguments are taken by value.
 static const auto LMax = [](auto lhs, auto rhs) {
   return (lhs >= rhs) ? lhs : rhs ;
 } ;
 // Generic minimum: yields the first argument when it compares <= the second,
 // otherwise the second.  Arguments are taken by value.
 static const auto LMin = [](auto lhs, auto rhs) {
   return (lhs <= rhs) ? lhs : rhs ;
 } ;

 // Compare-and-swap that reports the value it found in *p.
 //   p         : atomic to operate on
 //   Comparand : value *p is expected to hold
 //   Set       : value stored into *p when it holds Comparand
 // Returns Comparand when the swap happened, otherwise the value *p held.
 template<typename T>
 T CAS(std::atomic<T> * p, T Comparand, T Set) {
   // compare_exchange_strong overwrites its first argument with the contents
   // of *p on failure and leaves it alone on success, so the by-value
   // parameter doubles as the result.
   p->compare_exchange_strong (Comparand, Set) ;
   return Comparand ;
 }

 // Variant of CAS with the same contract: stores Set into *p when *p holds
 // Comparand, and returns the value *p held when the operation ran
 // (equal to Comparand exactly when the swap happened).
 template<typename T>
 T CASV(std::atomic<T> * p, T Comparand, T Set) {
   T observed = Comparand ;
   // On failure `observed` is overwritten with the current contents of *p.
   p->compare_exchange_strong (observed, Set) ;
   return observed ;
 }

 // Generic-lambda form of compare-and-swap.
 //   ARef : pointer to the atomic to operate on
 //   Cmp  : value the atomic is expected to hold
 //   Set  : value to install when the expectation is met
 // Returns Cmp when the swap happened, otherwise the value that was found.
 static const auto LCAS = [](auto ARef, auto Cmp, auto Set) {
   auto witness = Cmp ;
   // A failed exchange overwrites `witness` with the value actually present.
   std::atomic_compare_exchange_strong (ARef, &witness, Set) ;
   return witness ;
 } ;

 // Issues a sequentially consistent thread fence.
 static void FullFence() {
   constexpr std::memory_order kOrder = std::memory_order_seq_cst ;
   std::atomic_thread_fence (kOrder) ;
 }
 
 // Compiler-only barrier: stops the compiler from reordering memory accesses
 // across this point without emitting a hardware fence instruction.
 //
 // std::atomic_thread_fence(std::memory_order_relaxed) is specified to have
 // no effects, so it constrains neither the compiler nor the CPU.
 // std::atomic_signal_fence is the standard facility for a fence that
 // restricts only compiler reordering.
 static void CompilerFence() {
   std::atomic_signal_fence (std::memory_order_seq_cst) ;
 }


 // MonoTime() : provides a non-retrograde causal clock
 // Models HotSpot JVM nanoTime() implementation
 //
 // Remarks:
 // *  Note that the fetch of TMax is SC by default : memory_order_seq_cst.
 //    A weak load might suffice as the value is validated via CAS
 //    and CAS has strong fence semantics.  
 // *  We use an anonymous union to sequester and isolate TMax as sole occupant
 //    of its cache line.  
 // *  MonoTime() uses an abstract base, so adding a small epsilon is benign.
 //    That is, the implementation has latitude to add "Grain". 
 //    MonoTime() is uncorrelated with other clocks - not comparable
 //    It should be used only for relative time, not wall-clock absolute time.
 // *  Note that the grain of the underlying gethrtime() clock source
 //    already influences the peak update rate.  
 // *  A useful microoptimization is to LD Grain before the LD-CAS window
 // *  Being able to set Grain gives us a useful test to see if an application
 //    running on a system is sensitive to the nanoTime coherence hotspot. 
 // *  Requires only a one-line changeset relative to baseline : add Grain
 // *  Assumes the skew-drift is extremely small 
 // *  This example uses gethrtime() on Solaris 
 //    We could also use "RD STICK" directly.  
 //    Substitute your favorite clock source accordingly.
 //    On modern Linux/x86 systems clock_gettime (CLOCK_MONOTONIC...) is a good choice,
 //    assuming it's implemented via VDSO.
 //    You might also use RDTSCP directly.  
 // *  MonoTime() is expected to be causal in the following sense.
 //    Say thread A calls MonoTime() which returns value V.
 //    A stores V into some memory location.
 //    Thread B fetches from that location and observes the store, fetching V.
 //    B then calls MonoTime(), which returns W.
 //    It should be the case that W >= V.

 // Amount added to every timestamp before it is published through TMax.
 // Zero leaves the raw clock value unchanged.
 static hrtime_t Grain = 0 ;

 // Returns a timestamp that is never below a value already published in TMax.
 // Reads gethrtime(), compares it with the shared maximum TMax, and tries to
 // advance TMax with a single compare-and-swap when the clock has moved past
 // it.  There is no retry loop: a lost race returns the value found in TMax.
 static hrtime_t MonoTime() {
   // Shared high-water mark, declared CALIGNED and placed in an anonymous
   // union with an _Aligner member.
   static union { std::atomic<hrtime_t> CALIGNED TMax {0} ; _Aligner Ax; } ;
   hrtime_t candidate = gethrtime() ;
   const hrtime_t seen = TMax.load() ;
   // The clock has not passed the published maximum: reuse it without
   // writing to TMax.
   if (seen >= candidate) {
     return seen ;
   }
   // Quantize the candidate upward so later callers are more likely to take
   // the read-only path above.  Grain trades the write rate into TMax
   // against the granularity of the returned values.
   candidate += Grain ;
   const hrtime_t witness = LCAS (&TMax, seen, candidate) ;
   // witness == seen : our CAS installed candidate.
   // otherwise       : TMax changed since it was read; the claim is that
   //                   witness > seen, so returning it is still non-retrograde.
   return (witness == seen) ? candidate : witness ;
 }

 int main (int argc, char * argv []) {
  setbuf (stdout, NULL) ; 
  auto nthreads = atoi(argv[1]);
  Grain = atoi (argv[2]) ; 
  printf ("%d threads on %d CPUs; Grain=%lld\n",
    nthreads, 
    std::thread::hardware_concurrency(),
    Grain) ; 

  double Duration = 10.0;       

  // beware of false sharing ...
  std::mutex CALIGNED mut {}; 
  int64_t CALIGNED tally   = 0 ; 
  volatile bool CALIGNED done = false;

  std::vector<std::thread> threads;
  for (int i = 0; i < nthreads; i++) {
    threads.emplace_back(std::thread([&](){
      int64_t steps = 0 ; 
      hrtime_t sum = 0 ; 
      hrtime_t prv = MonoTime() ; 
      while (!done) {
        ++steps ; 
        const auto v = MonoTime() ; 
        if (v < prv) printf ("Error %llX-%llX %lld ", v, prv, prv-v) ;
        prv = v ; 
        sum += v ; 
      }
      if (sum == 0x11) printf ("!") ; 
      mut + [&] { tally += steps; } ; 
      printf ("%d ", steps) ; 
    }));
  }

  std::this_thread::sleep_for(std::chrono::duration<double>(Duration));
  done = true;

  for (auto &t : threads) {
    t.join();
  }

  printf ("\n") ; 
  std::cout << tally << " counted in " << Duration << " seconds (" << (tally/Duration) << " per sec)\n";
  printf ("\n") ; 

  return 0;
 }
	// Build: g++ -O3 -std=gnu++14 -m64 MonoTime.cc -lpthread -o MonoTime -mmemory-model=tso
	//
	// Dave Dice -- blogs.oracle.com/dave

	#include <thread>
	#include <chrono>
	#include <iostream>
	#include <vector>
	#include <mutex>
	#include <random>
	#include <atomic>
	#include <alloca.h>
	#include <cxxabi.h>

	#include <sys/time.h>

	// Runs `fn` while holding mutex `L` and hands back whatever `fn` returns,
	// which enables the shorthand `mutex + [&] { ... }` for a critical section.
	// The lock is dropped when the guard leaves scope, including when `fn`
	// exits by throwing.
	template<typename Fun>
	inline auto operator+(std::mutex & L, Fun&& fn) -> decltype(fn()) {
		const std::lock_guard<std::mutex> guard {L} ;
		return fn() ;
	}

	// Ideally we'd use alignas(128), but that isn't handled gracefully by all compilers
	// CALIGNED expands to a GCC-style attribute requesting 128-byte alignment for
	// the declaration it is attached to.
	#define CALIGNED __attribute__ ((aligned(128)))

	// Alignment-only helper: its sole member is a zero-length int array carrying
	// the CALIGNED attribute (zero-length arrays are a GNU extension).
	// MonoTime() places an _Aligner in the same anonymous union as TMax.
	// NOTE(review): identifiers starting with an underscore followed by an
	// uppercase letter are reserved for the implementation; consider renaming.
	struct _Aligner {
	CALIGNED int a [0] ;
	} ;

	// Ref<T> is an alias for T*.
	template<typename T> using Ref = T * ;
	// AlignedType<T> is an alias for T declared with the CALIGNED attribute.
	// NOTE(review): whether the attribute takes effect through an alias template
	// is compiler-dependent -- confirm before relying on it.
	template<typename T> using AlignedType CALIGNED = T ;
	// Av32t is a volatile int32_t declared with the CALIGNED attribute.
	using Av32t = CALIGNED volatile int32_t ;

	// Generic maximum: yields the first argument when it compares >= the second,
	// otherwise the second.  Arguments are taken by value.
	static const auto LMax = [](auto lhs, auto rhs) {
		return (lhs >= rhs) ? lhs : rhs ;
	} ;
	// Generic minimum: yields the first argument when it compares <= the second,
	// otherwise the second.  Arguments are taken by value.
	static const auto LMin = [](auto lhs, auto rhs) {
		return (lhs <= rhs) ? lhs : rhs ;
	} ;

	// Compare-and-swap that reports the value it found in *p.
	//   p         : atomic to operate on
	//   Comparand : value *p is expected to hold
	//   Set       : value stored into *p when it holds Comparand
	// Returns Comparand when the swap happened, otherwise the value *p held.
	template<typename T>
	T CAS(std::atomic<T> * p, T Comparand, T Set) {
		// compare_exchange_strong overwrites its first argument with the contents
		// of *p on failure and leaves it alone on success, so the by-value
		// parameter doubles as the result.
		p->compare_exchange_strong (Comparand, Set) ;
		return Comparand ;
	}

	// Variant of CAS with the same contract: stores Set into *p when *p holds
	// Comparand, and returns the value *p held when the operation ran
	// (equal to Comparand exactly when the swap happened).
	template<typename T>
	T CASV(std::atomic<T> * p, T Comparand, T Set) {
		T observed = Comparand ;
		// On failure `observed` is overwritten with the current contents of *p.
		p->compare_exchange_strong (observed, Set) ;
		return observed ;
	}

	// Generic-lambda form of compare-and-swap.
	//   ARef : pointer to the atomic to operate on
	//   Cmp  : value the atomic is expected to hold
	//   Set  : value to install when the expectation is met
	// Returns Cmp when the swap happened, otherwise the value that was found.
	static const auto LCAS = [](auto ARef, auto Cmp, auto Set) {
		auto witness = Cmp ;
		// A failed exchange overwrites `witness` with the value actually present.
		std::atomic_compare_exchange_strong (ARef, &witness, Set) ;
		return witness ;
	} ;

	// Issues a sequentially consistent thread fence.
	static void FullFence() {
		constexpr std::memory_order kOrder = std::memory_order_seq_cst ;
		std::atomic_thread_fence (kOrder) ;
	}

	// Compiler-only barrier: stops the compiler from reordering memory accesses
	// across this point without emitting a hardware fence instruction.
	//
	// std::atomic_thread_fence(std::memory_order_relaxed) is specified to have
	// no effects, so it constrains neither the compiler nor the CPU.
	// std::atomic_signal_fence is the standard facility for a fence that
	// restricts only compiler reordering.
	static void CompilerFence() {
		std::atomic_signal_fence (std::memory_order_seq_cst) ;
	}


	// MonoTime() : provides a non-retrograde causal clock
	// Models HotSpot JVM nanoTime() implementation
	//
	// Remarks:
	// * Note that the fetch of TMax is SC by default : memory_order_seq_cst.
	// A weak load might suffice as the value is validated via CAS
	// and CAS has strong fence semantics.
	// * We use an anonymous union to sequester and isolate TMax as sole occupant
	// of its cache line.
	// * MonoTime() uses an abstract base, so adding a small epsilon is benign.
	// That is, the implementation has latitude to add "Grain".
	// MonoTime() is uncorrelated with other clocks - not comparable
	// It should be used only for relative time, not wall-clock absolute time.
	// * Note that the grain of the underlying gethrtime() clock source
	// already influences the peak update rate.
	// * A useful microoptimization is to LD Grain before the LD-CAS window
	// * Being able to set Grain gives us a useful test to see if an application
	// running on a system is sensitive to the nanoTime coherence hotspot.
	// * Requires only a one-line changeset relative to baseline : add Grain
	// * Assumes the skew-drift is extremely small
	// * This example uses gethrtime() on Solaris
	// We could also use "RD STICK" directly.
	// Substitute your favorite clock source accordingly.
	// On modern Linux/x86 systems clock_gettime (CLOCK_MONOTONIC...) is a good choice,
	// assuming it's implemented via VDSO.
	// You might also use RDTSCP directly.
	// * MonoTime() is expected to be causal in the following sense.
	// Say thread A calls MonoTime() which returns value V.
	// A stores V into some memory location.
	// Thread B fetches from that location and observes the store, fetching V.
	// B then calls MonoTime(), which returns W.
	// It should be the case that W >= V.

	// Amount added to every timestamp before it is published through TMax.
	// Zero leaves the raw clock value unchanged.
	static hrtime_t Grain = 0 ;

	// Returns a timestamp that is never below a value already published in TMax.
	// Reads gethrtime(), compares it with the shared maximum TMax, and tries to
	// advance TMax with a single compare-and-swap when the clock has moved past
	// it.  There is no retry loop: a lost race returns the value found in TMax.
	static hrtime_t MonoTime() {
		// Shared high-water mark, declared CALIGNED and placed in an anonymous
		// union with an _Aligner member.
		static union { std::atomic<hrtime_t> CALIGNED TMax {0} ; _Aligner Ax; } ;
		hrtime_t candidate = gethrtime() ;
		const hrtime_t seen = TMax.load() ;
		// The clock has not passed the published maximum: reuse it without
		// writing to TMax.
		if (seen >= candidate) {
			return seen ;
		}
		// Quantize the candidate upward so later callers are more likely to take
		// the read-only path above.  Grain trades the write rate into TMax
		// against the granularity of the returned values.
		candidate += Grain ;
		const hrtime_t witness = LCAS (&TMax, seen, candidate) ;
		// witness == seen : our CAS installed candidate.
		// otherwise       : TMax changed since it was read; the claim is that
		//                   witness > seen, so returning it is still non-retrograde.
		return (witness == seen) ? candidate : witness ;
	}

	int main (int argc, char * argv []) {
	setbuf (stdout, NULL) ;
	auto nthreads = atoi(argv[1]);
	Grain = atoi (argv[2]) ;
	printf ("%d threads on %d CPUs; Grain=%lld\n",
	nthreads,
	std::thread::hardware_concurrency(),
	Grain) ;

	double Duration = 10.0;

	// beware of false sharing ...
	std::mutex CALIGNED mut {};
	int64_t CALIGNED tally = 0 ;
	volatile bool CALIGNED done = false;

	std::vector<std::thread> threads;
	for (int i = 0; i < nthreads; i++) {
	threads.emplace_back(std::thread([&](){
	int64_t steps = 0 ;
	hrtime_t sum = 0 ;
	hrtime_t prv = MonoTime() ;
	while (!done) {
	++steps ;
	const auto v = MonoTime() ;
	if (v < prv) printf ("Error %llX-%llX %lld ", v, prv, prv-v) ;
	prv = v ;
	sum += v ;
	}
	if (sum == 0x11) printf ("!") ;
	mut + [&] { tally += steps; } ;
	printf ("%d ", steps) ;
	}));
	}

	std::this_thread::sleep_for(std::chrono::duration<double>(Duration));
	done = true;

	for (auto &t : threads) {
	t.join();
	}

	printf ("\n") ;
	std::cout << tally << " counted in " << Duration << " seconds (" << (tally/Duration) << " per sec)\n";
	printf ("\n") ;

	return 0;
	}
No results found