pprof benchmark code v1
package main

import (
	"bytes"
	"fmt"
	"os"
	//"runtime"
	"runtime/pprof"
	"strconv"
	"sync"
	"time"

	pprofile "github.com/google/pprof/profile"
)

var wg sync.WaitGroup
/*
How I ran it:

    go build main2.go && sudo GOGC=off perf stat -d ./main2 20 profile

and

    go build main2.go && sudo GOGC=off perf stat -d ./main2 20 profileoff

then compare the elapsed times. Here is an example result:
pprof-overhead (main*) » go build main2.go && sudo GOGC=off perf stat -d ./main2 20 profileoff
Starting benchmark...
Done. avg: 397.335839ms, min:391.252319ms, max:407.334504ms
Performance counter stats for './main2 20 profileoff':
8,202.59 msec task-clock # 1.001 CPUs utilized
1,193 context-switches # 0.145 K/sec
5 cpu-migrations # 0.001 K/sec
3,870,412 page-faults # 0.472 M/sec
36,335,529,823 cycles # 4.430 GHz (50.06%)
97,220,434,963 instructions # 2.68 insn per cycle (62.58%)
24,922,002,510 branches # 3038.307 M/sec (62.58%)
8,874,605 branch-misses # 0.04% of all branches (62.58%)
31,584,655,224 L1-dcache-loads # 3850.569 M/sec (62.58%)
260,937,935 L1-dcache-load-misses # 0.83% of all L1-dcache hits (62.42%)
9,954,231 LLC-loads # 1.214 M/sec (49.90%)
4,206,466 LLC-load-misses # 42.26% of all LL-cache hits (49.90%)
8.196124365 seconds time elapsed
5.419365000 seconds user
2.789732000 seconds sys
pprof-overhead (main*) » go build main2.go && sudo GOGC=off perf stat -d ./main2 20 profile
cpu profiler is on
Starting benchmark...
Done. avg: 402.060052ms, min:393.520596ms, max:413.33551ms
Performance counter stats for './main2 20 profile':
8,347.27 msec task-clock # 0.981 CPUs utilized
1,682 context-switches # 0.202 K/sec
12 cpu-migrations # 0.001 K/sec
3,871,087 page-faults # 0.464 M/sec
36,663,980,903 cycles # 4.392 GHz (50.12%)
97,169,632,931 instructions # 2.65 insn per cycle (62.62%)
24,851,892,104 branches # 2977.247 M/sec (62.63%)
9,278,597 branch-misses # 0.04% of all branches (62.66%)
31,504,531,471 L1-dcache-loads # 3774.231 M/sec (62.66%)
523,097,299 L1-dcache-load-misses # 1.66% of all L1-dcache hits (62.33%)
19,507,961 LLC-loads # 2.337 M/sec (49.82%)
4,100,867 LLC-load-misses # 21.02% of all LL-cache hits (49.79%)
8.511315596 seconds time elapsed
5.498909000 seconds user
2.863598000 seconds sys
*/
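/*
Illustrative overhead arithmetic from the run above (my own calculation, not part of the original output):
  avg elapsed, profiler off: 397.335839ms
  avg elapsed, profiler on : 402.060052ms
  overhead ≈ (402.06 - 397.34) / 397.34 ≈ 1.2%
The counters also show L1-dcache-load-misses rising from 0.83% to 1.66% and LLC-loads roughly doubling
(9.95M -> 19.5M) with the profiler on, which fits the cache-pollution observations further below.
*/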
/*
> lscpu
Architecture: x86_64
CPU op-mode(s): 32-bit, 64-bit
Byte Order: Little Endian
CPU(s): 12
On-line CPU(s) list: 0-11
Thread(s) per core: 2
Core(s) per socket: 6
Socket(s): 1
NUMA node(s): 1
Vendor ID: GenuineIntel
CPU family: 6
Model: 158
Model name: Intel(R) Core(TM) i7-8700 CPU @ 3.20GHz
Stepping: 10
CPU MHz: 1556.894
CPU max MHz: 4600.0000
CPU min MHz: 800.0000
BogoMIPS: 6399.96
Virtualization: VT-x
L1d cache: 32K
L1i cache: 32K
L2 cache: 256K
L3 cache: 12288K
NUMA node0 CPU(s): 0-11
Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch cpuid_fault epb invpcid_single pti ssbd ibrs ibpb stibp tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx rdseed adx smap clflushopt intel_pt xsaveopt xsavec xgetbv1 xsaves dtherm ida arat pln pts hwp hwp_notify hwp_act_window hwp_epp md_clear flush_l1d arch_capabilities
*/
const nrow = 400
const ncol = 60
var x [][]float32
var y [][]float32
var out [][]float32
// Observations:
// - Making out local to the function matters: when it is global, it is presumably already resident in the L2 cache.
// - Increasing nrow/ncol beyond the L2 cache size mattered a bit; I saw a consistent ~2% difference.
// - I get worse results with runtime.NumCPU() goroutines (more threads): the profiling signal picks one of these
//   threads more or less at random and dirties its CPU cache. However, I would expect the same to happen on a
//   single thread with GOMAXPROCS, but maybe GOMAXPROCS already invalidates the CPU cache because of concurrent
//   GC runs, so I think having more threads is the better experiment. Try GOMAXPROCS with GOGC=off ...
//   (A parallel variant, burnCoresParallel, is sketched after burnCores below.)
func initMatrix() [][]float32 {
	r := make([][]float32, nrow)
	for i := 0; i < nrow; i++ {
		r[i] = make([]float32, ncol)
		// for j := 0; j < ncol; j++ {
		// 	r[i][j] = float32(i + j)
		// }
	}
	return r
}
func combinationMatrix() {
	// Note: out is deliberately local here, shadowing the global (see the observations above).
	out := initMatrix()
	for i := 0; i < len(x); i++ {
		for j := 0; j < len(y[0]); j++ {
			//for k := 0; k < len(y); k++ {
			if out[i][j] == 0 { // increase branch misprediction
				out[i][j] += x[i][j] + y[i][j]
			}
			//}
		}
	}
}
func recursiveTrampoline(n int) {
	if n == 0 {
		for i := 0; i < 4000; i++ {
			combinationMatrix()
			//fib(100000)
		}
		//fib(1000000000)
		wg.Done()
		return
	}
	recursiveTrampoline(n - 1)
}
func burnCores() {
	// runtime.NumCPU()
	for k := 0; k < 1; k++ {
		wg.Add(1)
		go recursiveTrampoline(64) // make sure stack size is at max
		//go fib(10000000000)
	}
	wg.Wait()
}
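// A minimal sketch of the multi-goroutine variant discussed in the observations above. The name
// burnCoresParallel and the nworkers constant are my additions, not part of the original gist;
// nworkers = 12 simply assumes one goroutine per hardware thread of the i7-8700 listed above
// (the original comment suggests runtime.NumCPU(), which needs the commented-out "runtime" import).
const nworkers = 12

func burnCoresParallel() {
	for k := 0; k < nworkers; k++ {
		wg.Add(1)
		go recursiveTrampoline(64) // same per-goroutine work as burnCores
	}
	wg.Wait()
}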
func fib(n uint) uint {
	if n < 2 {
		return n
	}
	var a, b uint
	b = 1
	for n--; n > 0; n-- {
		a += b
		a, b = b, a
	}
	//fmt.Println(wg)
	//wg.Done()
	return b
}
func savePprofFromBuf(buf *bytes.Buffer, filename string) {
	// Parse the raw CPU profile collected in buf (note: this consumes the buffer).
	prof, err := pprofile.Parse(buf)
	if err != nil {
		panic(err)
	}
	f, err := os.Create(filename)
	if err != nil {
		panic(err)
	}
	defer f.Close()
	// Write the profile to the file.
	if err := prof.Write(f); err != nil {
		panic(err)
	}
}
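// Optional helper, not part of the original gist: it parses a copy of the in-memory profile and
// prints the number of samples plus the totals per sample type, as a quick sanity check that the
// profiler actually sampled the benchmark. Call it before savePprofFromBuf, which consumes the
// buffer. The name reportPprofStats is my own.
func reportPprofStats(buf *bytes.Buffer) {
	prof, err := pprofile.Parse(bytes.NewReader(buf.Bytes()))
	if err != nil {
		panic(err)
	}
	totals := make([]int64, len(prof.SampleType))
	for _, s := range prof.Sample {
		for i, v := range s.Value {
			totals[i] += v
		}
	}
	for i, st := range prof.SampleType {
		fmt.Printf("samples: %d, %s total: %d %s\n", len(prof.Sample), st.Type, totals[i], st.Unit)
	}
}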
const nwarmup = 1
func main() {
	var buf bytes.Buffer
	var p bool

	//u, _ := strconv.ParseUint(os.Args[1], 10, 64)
	niter, _ := strconv.Atoi(os.Args[1])

	p = false
	if len(os.Args) > 2 && os.Args[2] == "profile" {
		fmt.Println("cpu profiler is on")
		//runtime.SetCPUProfileRate(10)
		if err := pprof.StartCPUProfile(&buf); err != nil {
			panic(err)
		}
		p = true
	}

	// init
	x = initMatrix()
	y = initMatrix()
	//out = initMatrix()

	// warmup - excluded from elapsed time
	// fmt.Println("Warming up...")
	// for i := 0; i < nwarmup; i++ {
	// 	burnCores()
	// }
	// fmt.Println("Warmup done.")

	fmt.Println("Starting benchmark...")
	var totelapsed, mine, maxe time.Duration
	for i := 0; i < niter; i++ {
		t0 := time.Now()
		burnCores()
		elapsed := time.Since(t0)
		if elapsed < mine || mine == 0 {
			mine = elapsed
		}
		if elapsed > maxe || maxe == 0 {
			maxe = elapsed
		}
		totelapsed += elapsed
	}
	fmt.Printf("Done. avg: %s, min:%s, max:%s\n", totelapsed/time.Duration(niter), mine, maxe)

	if p {
		pprof.StopCPUProfile()
		savePprofFromBuf(&buf, "p2.pb.gz")
	}
}
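// The written profile can then be inspected with the standard pprof tooling, e.g.:
//   go tool pprof -top p2.pb.gz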