sunhay · December 30, 2021 02:56
diff --git a/cbounce_results.md b/cbounce_results.md
diff --git a/cbounce_test.go b/cbounce_test.go
 package cbounce

 import (
 	"sync"
 	"testing"
 	"sync/atomic"
 )

 // sink receives every worker goroutine's final counter via atomic.AddUint64,
 // giving the benchmark loops a result that is observable outside the goroutine.
 var sink uint64

 // Cache line bouncing via false sharing:
 //   - False sharing occurs when threads on different processors modify variables that reside on the same cache line.
 //   - This invalidates the cache line and forces an update, which hurts performance.
 //   per https://software.intel.com/en-us/articles/avoiding-and-identifying-false-sharing-among-threads

 // Finding out your cache line size
 //   Mac:   $ sysctl hw.cachelinesize        -> 64
 //   Linux: $ getconf LEVEL1_DCACHE_LINESIZE -> 64

 // CacheBounce holds the mutexes used by benchmarkCachelineBouncing in a plain
 // slice, so consecutive mutexes are stored back to back with no padding.
 type CacheBounce struct {
 	// m has one mutex per worker goroutine; worker i only locks m[i].
 	m []sync.Mutex
 }

 // cb is the package-level CacheBounce that benchmarkCachelineBouncing replaces
 // on every call and whose mutexes its worker goroutines lock.
 var cb CacheBounce

 // benchmarkCachelineBouncing times numThreads goroutines that each lock and
 // unlock their own mutex b.N times. The mutexes are neighbouring elements of a
 // single []sync.Mutex, so several of them can end up on the same cache line.
 //
 // b is the benchmark handle; its timer is reset after the mutexes are set up.
 // numThreads is the number of worker goroutines (and mutexes) to create.
 func benchmarkCachelineBouncing(b *testing.B, numThreads int) {
 	var done sync.WaitGroup
 	cb = CacheBounce{m: make([]sync.Mutex, numThreads)}

 	// Every worker touches only the mutex stored at its own slot.
 	worker := func(slot int, count uint64) {
 		defer done.Done()
 		for remaining := b.N; remaining > 0; remaining-- {
 			cb.m[slot].Lock()
 			count++
 			cb.m[slot].Unlock()
 		}
 		// Publish the tally so the loop has an observable effect and
 		// cannot be optimized away.
 		atomic.AddUint64(&sink, count)
 	}

 	b.ResetTimer()
 	done.Add(numThreads)
 	for t := 0; t < numThreads; t++ {
 		go worker(t, 0)
 	}
 	done.Wait()
 }

 // NoCacheBounce holds the mutexes used by benchNoCacheLineBouncing. Each slice
 // element is a PaddedMutex rather than a bare sync.Mutex.
 type NoCacheBounce struct {
 	// m has one padded mutex per worker goroutine; worker i only locks m[i].
 	m []PaddedMutex
 }

 // PaddedMutex embeds a sync.Mutex and follows it with seven blank uint64
 // words. The size arithmetic in the trailing comments assumes sync.Mutex
 // occupies 8 bytes; nothing in the code enforces that (TODO confirm for the
 // toolchain in use).
 type PaddedMutex struct {
 	sync.Mutex  //   8 bytes (assumed size of sync.Mutex)
 	_ [7]uint64 // + 7 * 8 bytes of blank padding, never read or written
 }                   // = 64 bytes, the cache line size quoted earlier in this file

 // ncb is the package-level NoCacheBounce that benchNoCacheLineBouncing replaces
 // on every call and whose padded mutexes its worker goroutines lock.
 var ncb NoCacheBounce

 // benchNoCacheLineBouncing times numThreads goroutines that each lock and
 // unlock their own PaddedMutex b.N times. It is the padded counterpart of
 // benchmarkCachelineBouncing: the loop is the same, only the element type of
 // the mutex slice differs.
 //
 // b is the benchmark handle; its timer is reset after the mutexes are set up.
 // numThreads is the number of worker goroutines (and mutexes) to create.
 func benchNoCacheLineBouncing(b *testing.B, numThreads int) {
 	var done sync.WaitGroup
 	ncb = NoCacheBounce{m: make([]PaddedMutex, numThreads)}

 	// Every worker touches only the padded mutex stored at its own slot.
 	worker := func(slot int, count uint64) {
 		defer done.Done()
 		for remaining := b.N; remaining > 0; remaining-- {
 			ncb.m[slot].Lock()
 			count++
 			ncb.m[slot].Unlock()
 		}
 		// Publish the tally so the loop has an observable effect and
 		// cannot be optimized away.
 		atomic.AddUint64(&sink, count)
 	}

 	b.ResetTimer()
 	done.Add(numThreads)
 	for t := 0; t < numThreads; t++ {
 		go worker(t, 0)
 	}
 	done.Wait()
 }

 // Benchmark1ThreadNoCacheLineBouncing runs the padded-mutex benchmark with a
 // single worker goroutine.
 func Benchmark1ThreadNoCacheLineBouncing(b *testing.B) {
 	const workers = 1
 	benchNoCacheLineBouncing(b, workers)
 }

 // Benchmark1ThreadCacheLineBouncing runs the unpadded-mutex benchmark with a
 // single worker goroutine.
 func Benchmark1ThreadCacheLineBouncing(b *testing.B) {
 	const workers = 1
 	benchmarkCachelineBouncing(b, workers)
 }

 // Benchmark2ThreadsNoCacheLineBouncing runs the padded-mutex benchmark with
 // two worker goroutines.
 func Benchmark2ThreadsNoCacheLineBouncing(b *testing.B) {
 	const workers = 2
 	benchNoCacheLineBouncing(b, workers)
 }

 // Benchmark2ThreadsCacheLineBouncing runs the unpadded-mutex benchmark with
 // two worker goroutines.
 func Benchmark2ThreadsCacheLineBouncing(b *testing.B) {
 	const workers = 2
 	benchmarkCachelineBouncing(b, workers)
 }

 // Benchmark4ThreadsNoCacheLineBouncing runs the padded-mutex benchmark with
 // four worker goroutines.
 func Benchmark4ThreadsNoCacheLineBouncing(b *testing.B) {
 	const workers = 4
 	benchNoCacheLineBouncing(b, workers)
 }

 // Benchmark4ThreadsCacheLineBouncing runs the unpadded-mutex benchmark with
 // four worker goroutines.
 func Benchmark4ThreadsCacheLineBouncing(b *testing.B) {
 	const workers = 4
 	benchmarkCachelineBouncing(b, workers)
 }

 // Benchmark8ThreadsNoCacheLineBouncing runs the padded-mutex benchmark with
 // eight worker goroutines.
 func Benchmark8ThreadsNoCacheLineBouncing(b *testing.B) {
 	const workers = 8
 	benchNoCacheLineBouncing(b, workers)
 }

 // Benchmark8ThreadsCacheLineBouncing runs the unpadded-mutex benchmark with
 // eight worker goroutines.
 func Benchmark8ThreadsCacheLineBouncing(b *testing.B) {
 	const workers = 8
 	benchmarkCachelineBouncing(b, workers)
 }
	package cbounce

	import (
	"sync"
	"testing"
	"sync/atomic"
	)

	// sink receives every worker goroutine's final counter via atomic.AddUint64,
	// giving the benchmark loops a result that is observable outside the goroutine.
	var sink uint64

	// Cache line bouncing via false sharing:
	// - False sharing occurs when threads on different processors modify variables that reside on the same cache line.
	// - This invalidates the cache line and forces an update, which hurts performance.
	// per https://software.intel.com/en-us/articles/avoiding-and-identifying-false-sharing-among-threads

	// Finding out your cache line size
	// Mac: $ sysctl hw.cachelinesize -> 64
	// Linux: $ getconf LEVEL1_DCACHE_LINESIZE -> 64

	// CacheBounce holds the mutexes used by benchmarkCachelineBouncing in a plain
	// slice, so consecutive mutexes are stored back to back with no padding.
	type CacheBounce struct {
	// m has one mutex per worker goroutine; worker i only locks m[i].
	m []sync.Mutex
	}

	// cb is the package-level CacheBounce that benchmarkCachelineBouncing replaces
	// on every call and whose mutexes its worker goroutines lock.
	var cb CacheBounce

	// benchmarkCachelineBouncing times numThreads goroutines that each lock and
	// unlock their own mutex b.N times. The mutexes are neighbouring elements of a
	// single []sync.Mutex, so several of them can end up on the same cache line.
	//
	// b is the benchmark handle; its timer is reset after the mutexes are set up.
	// numThreads is the number of worker goroutines (and mutexes) to create.
	func benchmarkCachelineBouncing(b *testing.B, numThreads int) {
		var done sync.WaitGroup
		cb = CacheBounce{m: make([]sync.Mutex, numThreads)}

		// Every worker touches only the mutex stored at its own slot.
		worker := func(slot int, count uint64) {
			defer done.Done()
			for remaining := b.N; remaining > 0; remaining-- {
				cb.m[slot].Lock()
				count++
				cb.m[slot].Unlock()
			}
			// Publish the tally so the loop has an observable effect and
			// cannot be optimized away.
			atomic.AddUint64(&sink, count)
		}

		b.ResetTimer()
		done.Add(numThreads)
		for t := 0; t < numThreads; t++ {
			go worker(t, 0)
		}
		done.Wait()
	}

	// NoCacheBounce holds the mutexes used by benchNoCacheLineBouncing. Each slice
	// element is a PaddedMutex rather than a bare sync.Mutex.
	type NoCacheBounce struct {
	// m has one padded mutex per worker goroutine; worker i only locks m[i].
	m []PaddedMutex
	}

	// PaddedMutex embeds a sync.Mutex and follows it with seven blank uint64
	// words. The size arithmetic in the trailing comments assumes sync.Mutex
	// occupies 8 bytes; nothing in the code enforces that (TODO confirm for the
	// toolchain in use).
	type PaddedMutex struct {
	sync.Mutex // 8 bytes (assumed size of sync.Mutex)
	_ [7]uint64 // + 7 * 8 bytes of blank padding, never read or written
	} // = 64 bytes, the cache line size quoted earlier in this file

	// ncb is the package-level NoCacheBounce that benchNoCacheLineBouncing replaces
	// on every call and whose padded mutexes its worker goroutines lock.
	var ncb NoCacheBounce

	// benchNoCacheLineBouncing times numThreads goroutines that each lock and
	// unlock their own PaddedMutex b.N times. It is the padded counterpart of
	// benchmarkCachelineBouncing: the loop is the same, only the element type of
	// the mutex slice differs.
	//
	// b is the benchmark handle; its timer is reset after the mutexes are set up.
	// numThreads is the number of worker goroutines (and mutexes) to create.
	func benchNoCacheLineBouncing(b *testing.B, numThreads int) {
		var done sync.WaitGroup
		ncb = NoCacheBounce{m: make([]PaddedMutex, numThreads)}

		// Every worker touches only the padded mutex stored at its own slot.
		worker := func(slot int, count uint64) {
			defer done.Done()
			for remaining := b.N; remaining > 0; remaining-- {
				ncb.m[slot].Lock()
				count++
				ncb.m[slot].Unlock()
			}
			// Publish the tally so the loop has an observable effect and
			// cannot be optimized away.
			atomic.AddUint64(&sink, count)
		}

		b.ResetTimer()
		done.Add(numThreads)
		for t := 0; t < numThreads; t++ {
			go worker(t, 0)
		}
		done.Wait()
	}

	// Benchmark1ThreadNoCacheLineBouncing runs the padded-mutex benchmark with a
	// single worker goroutine.
	func Benchmark1ThreadNoCacheLineBouncing(b *testing.B) {
		const workers = 1
		benchNoCacheLineBouncing(b, workers)
	}

	// Benchmark1ThreadCacheLineBouncing runs the unpadded-mutex benchmark with a
	// single worker goroutine.
	func Benchmark1ThreadCacheLineBouncing(b *testing.B) {
		const workers = 1
		benchmarkCachelineBouncing(b, workers)
	}

	// Benchmark2ThreadsNoCacheLineBouncing runs the padded-mutex benchmark with
	// two worker goroutines.
	func Benchmark2ThreadsNoCacheLineBouncing(b *testing.B) {
		const workers = 2
		benchNoCacheLineBouncing(b, workers)
	}

	// Benchmark2ThreadsCacheLineBouncing runs the unpadded-mutex benchmark with
	// two worker goroutines.
	func Benchmark2ThreadsCacheLineBouncing(b *testing.B) {
		const workers = 2
		benchmarkCachelineBouncing(b, workers)
	}

	// Benchmark4ThreadsNoCacheLineBouncing runs the padded-mutex benchmark with
	// four worker goroutines.
	func Benchmark4ThreadsNoCacheLineBouncing(b *testing.B) {
		const workers = 4
		benchNoCacheLineBouncing(b, workers)
	}

	// Benchmark4ThreadsCacheLineBouncing runs the unpadded-mutex benchmark with
	// four worker goroutines.
	func Benchmark4ThreadsCacheLineBouncing(b *testing.B) {
		const workers = 4
		benchmarkCachelineBouncing(b, workers)
	}

	// Benchmark8ThreadsNoCacheLineBouncing runs the padded-mutex benchmark with
	// eight worker goroutines.
	func Benchmark8ThreadsNoCacheLineBouncing(b *testing.B) {
		const workers = 8
		benchNoCacheLineBouncing(b, workers)
	}

	// Benchmark8ThreadsCacheLineBouncing runs the unpadded-mutex benchmark with
	// eight worker goroutines.
	func Benchmark8ThreadsCacheLineBouncing(b *testing.B) {
		const workers = 8
		benchmarkCachelineBouncing(b, workers)
	}