sumerc · September 28, 2022 10:11
diff --git a/gistfile1.txt b/gistfile1.txt
 package main

 import (
 	"bytes"
 	"fmt"
 	"os"
 	//"runtime"
 	"runtime/pprof"
 	"strconv"
 	"sync"
 	"time"

 	pprofile "github.com/google/pprof/profile"
 )

 // wg tracks the worker goroutines launched by burnCores; every worker
 // signals completion through wg.Done in recursiveTrampoline.
 var wg sync.WaitGroup

 /*
 How I ran it:
 go build main2.go && sudo GOGC=off perf stat -d  ./main2 20 profile
 and
 go build main2.go && sudo GOGC=off perf stat -d  ./main2 20 profileoff
 and compare the elapsed time:
 here is an example result:
 pprof-overhead (main*) » go build main2.go && sudo GOGC=off perf stat -d  ./main2 20 profileoff                                             ~/Desktop/p/my-go-playground/pprof-overhead  
 Starting benchmark...
 Done. avg: 397.335839ms, min:391.252319ms, max:407.334504ms

 Performance counter stats for './main2 20 profileoff':

          8,202.59 msec task-clock                #    1.001 CPUs utilized          
             1,193      context-switches          #    0.145 K/sec                  
                 5      cpu-migrations            #    0.001 K/sec                  
         3,870,412      page-faults               #    0.472 M/sec                  
    36,335,529,823      cycles                    #    4.430 GHz                      (50.06%)
    97,220,434,963      instructions              #    2.68  insn per cycle           (62.58%)
    24,922,002,510      branches                  # 3038.307 M/sec                    (62.58%)
         8,874,605      branch-misses             #    0.04% of all branches          (62.58%)
    31,584,655,224      L1-dcache-loads           # 3850.569 M/sec                    (62.58%)
       260,937,935      L1-dcache-load-misses     #    0.83% of all L1-dcache hits    (62.42%)
         9,954,231      LLC-loads                 #    1.214 M/sec                    (49.90%)
         4,206,466      LLC-load-misses           #   42.26% of all LL-cache hits     (49.90%)

       8.196124365 seconds time elapsed

       5.419365000 seconds user
       2.789732000 seconds sys


 pprof-overhead (main*) » go build main2.go && sudo GOGC=off perf stat -d  ./main2 20 profile                                                ~/Desktop/p/my-go-playground/pprof-overhead  
 cpu profiler is on
 Starting benchmark...
 Done. avg: 402.060052ms, min:393.520596ms, max:413.33551ms

 Performance counter stats for './main2 20 profile':

          8,347.27 msec task-clock                #    0.981 CPUs utilized          
             1,682      context-switches          #    0.202 K/sec                  
                12      cpu-migrations            #    0.001 K/sec                  
         3,871,087      page-faults               #    0.464 M/sec                  
    36,663,980,903      cycles                    #    4.392 GHz                      (50.12%)
    97,169,632,931      instructions              #    2.65  insn per cycle           (62.62%)
    24,851,892,104      branches                  # 2977.247 M/sec                    (62.63%)
         9,278,597      branch-misses             #    0.04% of all branches          (62.66%)
    31,504,531,471      L1-dcache-loads           # 3774.231 M/sec                    (62.66%)
       523,097,299      L1-dcache-load-misses     #    1.66% of all L1-dcache hits    (62.33%)
        19,507,961      LLC-loads                 #    2.337 M/sec                    (49.82%)
         4,100,867      LLC-load-misses           #   21.02% of all LL-cache hits     (49.79%)

       8.511315596 seconds time elapsed

       5.498909000 seconds user
       2.863598000 seconds sys
 */

 /*
 > lscpu
 Architecture:        x86_64
 CPU op-mode(s):      32-bit, 64-bit
 Byte Order:          Little Endian
 CPU(s):              12
 On-line CPU(s) list: 0-11
 Thread(s) per core:  2
 Core(s) per socket:  6
 Socket(s):           1
 NUMA node(s):        1
 Vendor ID:           GenuineIntel
 CPU family:          6
 Model:               158
 Model name:          Intel(R) Core(TM) i7-8700 CPU @ 3.20GHz
 Stepping:            10
 CPU MHz:             1556.894
 CPU max MHz:         4600.0000
 CPU min MHz:         800.0000
 BogoMIPS:            6399.96
 Virtualization:      VT-x
 L1d cache:           32K
 L1i cache:           32K
 L2 cache:            256K
 L3 cache:            12288K
 NUMA node0 CPU(s):   0-11
 Flags:               fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch cpuid_fault epb invpcid_single pti ssbd ibrs ibpb stibp tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx rdseed adx smap clflushopt intel_pt xsaveopt xsavec xgetbv1 xsaves dtherm ida arat pln pts hwp hwp_notify hwp_act_window hwp_epp md_clear flush_l1d arch_capabilities
 */

 // Dimensions of every matrix produced by initMatrix.
 const (
 	nrow = 400
 	ncol = 60
 )

 // Package-level matrices. main assigns x and y, and combinationMatrix reads
 // them. out is declared here, but combinationMatrix works on its own local
 // matrix of the same name.
 var (
 	x   [][]float32
 	y   [][]float32
 	out [][]float32
 )

 // - Making out local to the function matters: when it is global it is probably already cached in L2 (a guess).
 // - Increasing nrow/ncol beyond the L2 cache size mattered a bit: I saw a consistent 2%.
 // - I get worse results when there are runtime.NumCPU() goroutines (more threads). So, a signal
 //   fairly selects one of these threads and makes its CPU cache dirty. However, I would expect the same to happen on a
 //   single thread with GOMAXPROCS? But maybe GOMAXPROCS already invalidates the CPU cache because of concurrent GC runs.
 //   So, I think having more threads is a better idea. Try GOMAXPROCS with GOGC=off ...

 // initMatrix allocates and returns a matrix of nrow rows, each an
 // independently allocated slice of ncol float32 elements. All elements are
 // left at their zero value.
 func initMatrix() [][]float32 {
 	rows := make([][]float32, nrow)
 	for idx := range rows {
 		rows[idx] = make([]float32, ncol)
 	}
 	return rows
 }

 // combinationMatrix is the benchmark workload. It builds a fresh local matrix
 // with initMatrix and, for every row index of x and every column index of
 // y[0], stores x[i][j] + y[i][j] into each cell that is still zero. The local
 // matrix is discarded when the function returns; x and y are only read.
 func combinationMatrix() {
 	sum := initMatrix()
 	for i := range x {
 		for j := range y[0] {
 			cell := &sum[i][j]
 			// Data-dependent branch kept on purpose as part of the workload
 			// (the author's note: "increase branch misprediction").
 			if *cell != 0 {
 				continue
 			}
 			*cell += x[i][j] + y[i][j]
 		}
 	}
 }

 // recursiveTrampoline descends n stack frames and, at the bottom, runs the
 // benchmark workload: 4000 calls to combinationMatrix followed by exactly one
 // wg.Done. The recursion only deepens the call stack underneath the workload.
 //
 // A non-positive n runs the workload immediately. Negative values are treated
 // like 0 rather than recursing without bound until the stack overflows.
 func recursiveTrampoline(n int) {
 	if n > 0 {
 		recursiveTrampoline(n - 1)
 		return
 	}

 	const rounds = 4000
 	for i := 0; i < rounds; i++ {
 		combinationMatrix()
 	}
 	wg.Done()
 }

 // burnCores starts the benchmark workers and blocks until all of them have
 // finished. The worker count is currently one. Each worker is registered with
 // wg before its goroutine starts and enters the workload through
 // recursiveTrampoline with a recursion depth of 64.
 func burnCores() {
 	const (
 		workers = 1  // the author's note names runtime.NumCPU() as an alternative
 		depth   = 64 // recursion depth handed to every worker
 	)
 	for started := 0; started != workers; started++ {
 		wg.Add(1)
 		go recursiveTrampoline(depth)
 	}
 	wg.Wait()
 }

 // fib returns the n-th Fibonacci number (fib(0) = 0, fib(1) = 1), computed
 // iteratively. The arithmetic is done on uint, so large results wrap around.
 func fib(n uint) uint {
 	prev, cur := uint(0), uint(1)
 	if n < 2 {
 		return n
 	}
 	for step := uint(1); step < n; step++ {
 		prev, cur = cur, prev+cur
 	}
 	return cur
 }

 // savePprofFromBuf parses the profile data held in buf and writes the parsed
 // profile to a newly created file named filename.
 //
 // Any failure (parsing, creating, writing or closing the file) panics. The
 // file is always closed before returning, and a Close error is reported too,
 // because it can mean the data never reached the disk.
 func savePprofFromBuf(buf *bytes.Buffer, filename string) {
 	// Named prof so the imported runtime/pprof package is not shadowed.
 	prof, err := pprofile.Parse(buf)
 	if err != nil {
 		panic(err)
 	}

 	f, err := os.Create(filename)
 	if err != nil {
 		panic(err)
 	}
 	// Write the profile to the file.
 	if err := prof.Write(f); err != nil {
 		_ = f.Close() // best effort; the write error is the one worth reporting
 		panic(err)
 	}
 	if err := f.Close(); err != nil {
 		panic(err)
 	}
 }

 // nwarmup is the number of warmup rounds. Its only use is in the
 // commented-out warmup loop in main.
 const nwarmup = 1

 // main runs the pprof-overhead benchmark.
 //
 // Usage: <binary> <iterations> [profile]
 //
 // The first argument is the number of timed burnCores rounds. When the second
 // argument is exactly "profile", a CPU profile is collected in memory for the
 // whole run and written to p2.pb.gz at the end; any other value leaves the
 // profiler off. The average, minimum and maximum round times are printed.
 //
 // A missing, non-numeric or non-positive iteration count prints a usage
 // message to stderr and exits with status 2, instead of panicking on
 // os.Args[1] or dividing by zero when the average is computed.
 func main() {
 	niter := 0
 	if len(os.Args) > 1 {
 		// A parse failure leaves niter at 0, which is rejected just below.
 		if v, err := strconv.Atoi(os.Args[1]); err == nil {
 			niter = v
 		}
 	}
 	if niter <= 0 {
 		fmt.Fprintf(os.Stderr, "usage: %s <iterations> [profile]\n", os.Args[0])
 		os.Exit(2)
 	}

 	var buf bytes.Buffer
 	profiling := len(os.Args) > 2 && os.Args[2] == "profile"
 	if profiling {
 		fmt.Println("cpu profiler is on")
 		//runtime.SetCPUProfileRate(10)
 		if err := pprof.StartCPUProfile(&buf); err != nil {
 			panic(err)
 		}
 	}

 	// init
 	x = initMatrix()
 	y = initMatrix()
 	//out = initMatrix()

 	// warmup - excluded from elapsed time
 	// fmt.Println("Warming up...")
 	// for i := 0; i < nwarmup; i++ {
 	// 	burnCores()
 	// }
 	// fmt.Println("Warmup done.")

 	fmt.Println("Starting benchmark...")
 	var total, fastest, slowest time.Duration
 	for i := 0; i < niter; i++ {
 		start := time.Now()
 		burnCores()
 		elapsed := time.Since(start)
 		// The first round seeds both extremes.
 		if i == 0 || elapsed < fastest {
 			fastest = elapsed
 		}
 		if i == 0 || elapsed > slowest {
 			slowest = elapsed
 		}
 		total += elapsed
 	}

 	fmt.Printf("Done. avg: %s, min:%s, max:%s\n", total/time.Duration(niter), fastest, slowest)

 	if profiling {
 		pprof.StopCPUProfile()
 		savePprofFromBuf(&buf, "p2.pb.gz")
 	}
 }
	package main

	import (
	"bytes"
	"fmt"
	"os"
	//"runtime"
	"runtime/pprof"
	"strconv"
	"sync"
	"time"

	pprofile "github.com/google/pprof/profile"
	)

	// wg tracks the worker goroutines launched by burnCores; every worker
	// signals completion through wg.Done in recursiveTrampoline.
	var wg sync.WaitGroup

	/*
	How I ran it:
	go build main2.go && sudo GOGC=off perf stat -d ./main2 20 profile
	and
	go build main2.go && sudo GOGC=off perf stat -d ./main2 20 profileoff
	and compare the elapsed time:
	here is an example result:
	pprof-overhead (main*) » go build main2.go && sudo GOGC=off perf stat -d ./main2 20 profileoff ~/Desktop/p/my-go-playground/pprof-overhead
	Starting benchmark...
	Done. avg: 397.335839ms, min:391.252319ms, max:407.334504ms

	Performance counter stats for './main2 20 profileoff':

	8,202.59 msec task-clock # 1.001 CPUs utilized
	1,193 context-switches # 0.145 K/sec
	5 cpu-migrations # 0.001 K/sec
	3,870,412 page-faults # 0.472 M/sec
	36,335,529,823 cycles # 4.430 GHz (50.06%)
	97,220,434,963 instructions # 2.68 insn per cycle (62.58%)
	24,922,002,510 branches # 3038.307 M/sec (62.58%)
	8,874,605 branch-misses # 0.04% of all branches (62.58%)
	31,584,655,224 L1-dcache-loads # 3850.569 M/sec (62.58%)
	260,937,935 L1-dcache-load-misses # 0.83% of all L1-dcache hits (62.42%)
	9,954,231 LLC-loads # 1.214 M/sec (49.90%)
	4,206,466 LLC-load-misses # 42.26% of all LL-cache hits (49.90%)

	8.196124365 seconds time elapsed

	5.419365000 seconds user
	2.789732000 seconds sys


	pprof-overhead (main*) » go build main2.go && sudo GOGC=off perf stat -d ./main2 20 profile ~/Desktop/p/my-go-playground/pprof-overhead
	cpu profiler is on
	Starting benchmark...
	Done. avg: 402.060052ms, min:393.520596ms, max:413.33551ms

	Performance counter stats for './main2 20 profile':

	8,347.27 msec task-clock # 0.981 CPUs utilized
	1,682 context-switches # 0.202 K/sec
	12 cpu-migrations # 0.001 K/sec
	3,871,087 page-faults # 0.464 M/sec
	36,663,980,903 cycles # 4.392 GHz (50.12%)
	97,169,632,931 instructions # 2.65 insn per cycle (62.62%)
	24,851,892,104 branches # 2977.247 M/sec (62.63%)
	9,278,597 branch-misses # 0.04% of all branches (62.66%)
	31,504,531,471 L1-dcache-loads # 3774.231 M/sec (62.66%)
	523,097,299 L1-dcache-load-misses # 1.66% of all L1-dcache hits (62.33%)
	19,507,961 LLC-loads # 2.337 M/sec (49.82%)
	4,100,867 LLC-load-misses # 21.02% of all LL-cache hits (49.79%)

	8.511315596 seconds time elapsed

	5.498909000 seconds user
	2.863598000 seconds sys
	*/

	/*
	> lscpu
	Architecture: x86_64
	CPU op-mode(s): 32-bit, 64-bit
	Byte Order: Little Endian
	CPU(s): 12
	On-line CPU(s) list: 0-11
	Thread(s) per core: 2
	Core(s) per socket: 6
	Socket(s): 1
	NUMA node(s): 1
	Vendor ID: GenuineIntel
	CPU family: 6
	Model: 158
	Model name: Intel(R) Core(TM) i7-8700 CPU @ 3.20GHz
	Stepping: 10
	CPU MHz: 1556.894
	CPU max MHz: 4600.0000
	CPU min MHz: 800.0000
	BogoMIPS: 6399.96
	Virtualization: VT-x
	L1d cache: 32K
	L1i cache: 32K
	L2 cache: 256K
	L3 cache: 12288K
	NUMA node0 CPU(s): 0-11
	Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch cpuid_fault epb invpcid_single pti ssbd ibrs ibpb stibp tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx rdseed adx smap clflushopt intel_pt xsaveopt xsavec xgetbv1 xsaves dtherm ida arat pln pts hwp hwp_notify hwp_act_window hwp_epp md_clear flush_l1d arch_capabilities
	*/

	// Dimensions of every matrix produced by initMatrix.
	const (
		nrow = 400
		ncol = 60
	)

	// Package-level matrices. main assigns x and y, and combinationMatrix reads
	// them. out is declared here, but combinationMatrix works on its own local
	// matrix of the same name.
	var (
		x   [][]float32
		y   [][]float32
		out [][]float32
	)

	// - Making out local to the function matters: when it is global it is probably already cached in L2 (a guess).
	// - Increasing nrow/ncol beyond the L2 cache size mattered a bit: I saw a consistent 2%.
	// - I get worse results when there are runtime.NumCPU() goroutines (more threads). So, a signal
	//   fairly selects one of these threads and makes its CPU cache dirty. However, I would expect the same to happen on a
	//   single thread with GOMAXPROCS? But maybe GOMAXPROCS already invalidates the CPU cache because of concurrent GC runs.
	//   So, I think having more threads is a better idea. Try GOMAXPROCS with GOGC=off ...

	// initMatrix allocates and returns a matrix of nrow rows, each an
	// independently allocated slice of ncol float32 elements. All elements are
	// left at their zero value.
	func initMatrix() [][]float32 {
		rows := make([][]float32, nrow)
		for idx := range rows {
			rows[idx] = make([]float32, ncol)
		}
		return rows
	}

	// combinationMatrix is the benchmark workload. It builds a fresh local matrix
	// with initMatrix and, for every row index of x and every column index of
	// y[0], stores x[i][j] + y[i][j] into each cell that is still zero. The local
	// matrix is discarded when the function returns; x and y are only read.
	func combinationMatrix() {
		sum := initMatrix()
		for i := range x {
			for j := range y[0] {
				cell := &sum[i][j]
				// Data-dependent branch kept on purpose as part of the workload
				// (the author's note: "increase branch misprediction").
				if *cell != 0 {
					continue
				}
				*cell += x[i][j] + y[i][j]
			}
		}
	}

	// recursiveTrampoline descends n stack frames and, at the bottom, runs the
	// benchmark workload: 4000 calls to combinationMatrix followed by exactly one
	// wg.Done. The recursion only deepens the call stack underneath the workload.
	//
	// A non-positive n runs the workload immediately. Negative values are treated
	// like 0 rather than recursing without bound until the stack overflows.
	func recursiveTrampoline(n int) {
		if n > 0 {
			recursiveTrampoline(n - 1)
			return
		}

		const rounds = 4000
		for i := 0; i < rounds; i++ {
			combinationMatrix()
		}
		wg.Done()
	}

	// burnCores starts the benchmark workers and blocks until all of them have
	// finished. The worker count is currently one. Each worker is registered with
	// wg before its goroutine starts and enters the workload through
	// recursiveTrampoline with a recursion depth of 64.
	func burnCores() {
		const (
			workers = 1  // the author's note names runtime.NumCPU() as an alternative
			depth   = 64 // recursion depth handed to every worker
		)
		for started := 0; started != workers; started++ {
			wg.Add(1)
			go recursiveTrampoline(depth)
		}
		wg.Wait()
	}

	// fib returns the n-th Fibonacci number (fib(0) = 0, fib(1) = 1), computed
	// iteratively. The arithmetic is done on uint, so large results wrap around.
	func fib(n uint) uint {
		prev, cur := uint(0), uint(1)
		if n < 2 {
			return n
		}
		for step := uint(1); step < n; step++ {
			prev, cur = cur, prev+cur
		}
		return cur
	}

	// savePprofFromBuf parses the profile data held in buf and writes the parsed
	// profile to a newly created file named filename.
	//
	// Any failure (parsing, creating, writing or closing the file) panics. The
	// file is always closed before returning, and a Close error is reported too,
	// because it can mean the data never reached the disk.
	func savePprofFromBuf(buf *bytes.Buffer, filename string) {
		// Named prof so the imported runtime/pprof package is not shadowed.
		prof, err := pprofile.Parse(buf)
		if err != nil {
			panic(err)
		}

		f, err := os.Create(filename)
		if err != nil {
			panic(err)
		}
		// Write the profile to the file.
		if err := prof.Write(f); err != nil {
			_ = f.Close() // best effort; the write error is the one worth reporting
			panic(err)
		}
		if err := f.Close(); err != nil {
			panic(err)
		}
	}

	// nwarmup is the number of warmup rounds. Its only use is in the
	// commented-out warmup loop in main.
	const nwarmup = 1

	func main() {
	var buf bytes.Buffer
	var p bool
	//u, _ := strconv.ParseUint(os.Args[1], 10, 64)
	niter, _ := strconv.Atoi(os.Args[1])

	p = false
	if len(os.Args) > 2 && os.Args[2] == "profile" {
	fmt.Println("cpu profiler is on")
	//runtime.SetCPUProfileRate(10)
	if err := pprof.StartCPUProfile(&buf); err != nil {
	panic(err)
	}
	p = true
	}

	// init
	x = initMatrix()
	y = initMatrix()
	//out = initMatrix()

	// warmup - excluded from elapsed time
	// fmt.Println("Warming up...")
	// for i := 0; i < nwarmup; i++ {
	// burnCores()
	// }
	// fmt.Println("Warmup done.")


	fmt.Println("Starting benchmark...")
	var totelapsed, mine, maxe time.Duration
	for i := 0; i < niter; i++ {
	t0 := time.Now()
	burnCores()
	elapsed := time.Since(t0)
	if elapsed < mine \|\| mine == 0 {
	mine = elapsed
	}
	if elapsed > maxe \|\| maxe == 0 {
	maxe = elapsed
	}
	totelapsed += elapsed
	}

	fmt.Println(fmt.Sprintf("Done. avg: %s, min:%s, max:%s", totelapsed/time.Duration(niter), mine, maxe))

	if p {
	pprof.StopCPUProfile()
	savePprofFromBuf(&buf, "p2.pb.gz")
	}
	}
No results found