Last active
September 28, 2022 10:11
-
-
Save sumerc/35fe8c213fb7623360157f28be5aebd2 to your computer and use it in GitHub Desktop.
pprof benchmark code v1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bytes" | |
"fmt" | |
"os" | |
//"runtime" | |
"runtime/pprof" | |
"strconv" | |
"sync" | |
"time" | |
pprofile "github.com/google/pprof/profile" | |
) | |
var wg sync.WaitGroup | |
/* | |
How I ran it: | |
go build main2.go && sudo GOGC=off perf stat -d ./main2 20 profile | |
and | |
go build main2.go && sudo GOGC=off perf stat -d ./main2 20 profileoff | |
and compare the elapsed time: | |
here is an example result: | |
pprof-overhead (main*) » go build main2.go && sudo GOGC=off perf stat -d ./main2 20 profileoff ~/Desktop/p/my-go-playground/pprof-overhead | |
Starting benchmark... | |
Done. avg: 397.335839ms, min:391.252319ms, max:407.334504ms | |
Performance counter stats for './main2 20 profileoff': | |
8,202.59 msec task-clock # 1.001 CPUs utilized | |
1,193 context-switches # 0.145 K/sec | |
5 cpu-migrations # 0.001 K/sec | |
3,870,412 page-faults # 0.472 M/sec | |
36,335,529,823 cycles # 4.430 GHz (50.06%) | |
97,220,434,963 instructions # 2.68 insn per cycle (62.58%) | |
24,922,002,510 branches # 3038.307 M/sec (62.58%) | |
8,874,605 branch-misses # 0.04% of all branches (62.58%) | |
31,584,655,224 L1-dcache-loads # 3850.569 M/sec (62.58%) | |
260,937,935 L1-dcache-load-misses # 0.83% of all L1-dcache hits (62.42%) | |
9,954,231 LLC-loads # 1.214 M/sec (49.90%) | |
4,206,466 LLC-load-misses # 42.26% of all LL-cache hits (49.90%) | |
8.196124365 seconds time elapsed | |
5.419365000 seconds user | |
2.789732000 seconds sys | |
pprof-overhead (main*) » go build main2.go && sudo GOGC=off perf stat -d ./main2 20 profile ~/Desktop/p/my-go-playground/pprof-overhead | |
cpu profiler is on | |
Starting benchmark... | |
Done. avg: 402.060052ms, min:393.520596ms, max:413.33551ms | |
Performance counter stats for './main2 20 profile': | |
8,347.27 msec task-clock # 0.981 CPUs utilized | |
1,682 context-switches # 0.202 K/sec | |
12 cpu-migrations # 0.001 K/sec | |
3,871,087 page-faults # 0.464 M/sec | |
36,663,980,903 cycles # 4.392 GHz (50.12%) | |
97,169,632,931 instructions # 2.65 insn per cycle (62.62%) | |
24,851,892,104 branches # 2977.247 M/sec (62.63%) | |
9,278,597 branch-misses # 0.04% of all branches (62.66%) | |
31,504,531,471 L1-dcache-loads # 3774.231 M/sec (62.66%) | |
523,097,299 L1-dcache-load-misses # 1.66% of all L1-dcache hits (62.33%) | |
19,507,961 LLC-loads # 2.337 M/sec (49.82%) | |
4,100,867 LLC-load-misses # 21.02% of all LL-cache hits (49.79%) | |
8.511315596 seconds time elapsed | |
5.498909000 seconds user | |
2.863598000 seconds sys | |
*/ | |
/* | |
> lscpu | |
Architecture: x86_64 | |
CPU op-mode(s): 32-bit, 64-bit | |
Byte Order: Little Endian | |
CPU(s): 12 | |
On-line CPU(s) list: 0-11 | |
Thread(s) per core: 2 | |
Core(s) per socket: 6 | |
Socket(s): 1 | |
NUMA node(s): 1 | |
Vendor ID: GenuineIntel | |
CPU family: 6 | |
Model: 158 | |
Model name: Intel(R) Core(TM) i7-8700 CPU @ 3.20GHz | |
Stepping: 10 | |
CPU MHz: 1556.894 | |
CPU max MHz: 4600.0000 | |
CPU min MHz: 800.0000 | |
BogoMIPS: 6399.96 | |
Virtualization: VT-x | |
L1d cache: 32K | |
L1i cache: 32K | |
L2 cache: 256K | |
L3 cache: 12288K | |
NUMA node0 CPU(s): 0-11 | |
Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch cpuid_fault epb invpcid_single pti ssbd ibrs ibpb stibp tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx rdseed adx smap clflushopt intel_pt xsaveopt xsavec xgetbv1 xsaves dtherm ida arat pln pts hwp hwp_notify hwp_act_window hwp_epp md_clear flush_l1d arch_capabilities | |
*/ | |
const nrow = 400 | |
const ncol = 60 | |
var x [][]float32 | |
var y [][]float32 | |
var out [][]float32 | |
// - when out is local to the function it matters, because when it is global it will already be cached to the L2, I guess? | |
// - increasing nrow/ncol > l2 cache mattered a bit. I saw consistent %2 | |
// - I get worse results when there are runtime.NumCPU() goroutines (more threads). So, a signal | |
// fairly selects one of these threads and make the CPU cache dirty. However, I expect same to happen on a single thread | |
// with GOMAXPROCS? But maybe: GOMAXPROCS already obsoletes the CPU cache because of concurrent runs of GC. So, I think | |
// having more threads is a better idea. Try GOMAXPROCS with GOGC=off ... | |
func initMatrix() [][]float32 { | |
r := make([][]float32, nrow) | |
for i := 0; i < nrow; i++ { | |
r[i] = make([]float32, ncol) | |
// for j := 0; j < ncol; j++ { | |
// r[i][j] = float32(i + j) | |
// } | |
} | |
return r | |
} | |
func combinationMatrix() { | |
out := initMatrix() | |
for i := 0; i < len(x); i++ { | |
for j := 0; j < len(y[0]); j++ { | |
//for k := 0; k < len(y); k++ { | |
if out[i][j] == 0 { // increase branch misprediction | |
out[i][j] += x[i][j] + y[i][j] | |
} | |
//} | |
} | |
} | |
} | |
func recursiveTrampoline(n int) { | |
if n == 0 { | |
for i := 0; i < 4000; i++ { | |
combinationMatrix() | |
//fib(100000) | |
} | |
//fib(1000000000) | |
wg.Done() | |
return | |
} | |
recursiveTrampoline(n - 1) | |
} | |
func burnCores() { | |
// runtime.NumCPU() | |
for k := 0; k < 1; k++ { | |
wg.Add(1) | |
go recursiveTrampoline(64) // make sure stack size is at max | |
//go fib(10000000000) | |
} | |
wg.Wait() | |
} | |
func fib(n uint) uint { | |
if n < 2 { | |
return n | |
} | |
var a, b uint | |
b = 1 | |
for n--; n > 0; n-- { | |
a += b | |
a, b = b, a | |
} | |
//fmt.Println(wg) | |
//wg.Done() | |
return b | |
} | |
func savePprofFromBuf(buf *bytes.Buffer, filename string) { | |
pprof, err := pprofile.Parse(buf) | |
if err != nil { | |
panic(err) | |
} | |
f, err := os.Create(filename) | |
if err != nil { | |
panic(err) | |
} | |
// Write the profile to the file. | |
if err := pprof.Write(f); err != nil { | |
panic(err) | |
} | |
} | |
const nwarmup = 1 | |
func main() { | |
var buf bytes.Buffer | |
var p bool | |
//u, _ := strconv.ParseUint(os.Args[1], 10, 64) | |
niter, _ := strconv.Atoi(os.Args[1]) | |
p = false | |
if len(os.Args) > 2 && os.Args[2] == "profile" { | |
fmt.Println("cpu profiler is on") | |
//runtime.SetCPUProfileRate(10) | |
if err := pprof.StartCPUProfile(&buf); err != nil { | |
panic(err) | |
} | |
p = true | |
} | |
// init | |
x = initMatrix() | |
y = initMatrix() | |
//out = initMatrix() | |
// warmup - excluded from elapsed time | |
// fmt.Println("Warming up...") | |
// for i := 0; i < nwarmup; i++ { | |
// burnCores() | |
// } | |
// fmt.Println("Warmup done.") | |
fmt.Println("Starting benchmark...") | |
var totelapsed, mine, maxe time.Duration | |
for i := 0; i < niter; i++ { | |
t0 := time.Now() | |
burnCores() | |
elapsed := time.Since(t0) | |
if elapsed < mine || mine == 0 { | |
mine = elapsed | |
} | |
if elapsed > maxe || maxe == 0 { | |
maxe = elapsed | |
} | |
totelapsed += elapsed | |
} | |
fmt.Println(fmt.Sprintf("Done. avg: %s, min:%s, max:%s", totelapsed/time.Duration(niter), mine, maxe)) | |
if p { | |
pprof.StopCPUProfile() | |
savePprofFromBuf(&buf, "p2.pb.gz") | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment