Create a script which is forced to execute non-cooperative go-routines one at a time because they are only running on a single LWP
- Try to minimize the number of concurrent threads / LWPs via GOMAXPROCS.
- In theory, the go-routines should then execute one after the other, since they are non-cooperative.
- Note: On a 16-core box, Golang by default detects the 16 available cores and GOMAXPROCS is set to 16.
$ cat blocking.go
package main
import (
"os"
"fmt"
"time"
"strconv"
"syscall"
"runtime"
"runtime/pprof"
"runtime/trace"
)
var timeStart = time.Now()
func f(from string, score int) {
t1 := time.Now()
fmt.Printf("%f %6d=tid f(%s, %6d) // enter; .NumCPU()=%d .GOMAXPROCS(0)=%d .NumGoroutine()=%d\n", time.Now().Sub(timeStart).Seconds(), syscall.Gettid(), from, score, runtime.NumCPU(), runtime.GOMAXPROCS(0), runtime.NumGoroutine())
for i := 1; i <= score; i++ {
for z := 1; z <= score; z++ {
}
if 0 == (i % 20000) {
fmt.Printf("%f %6d=tid f(%s, %6d) // i=%d\n", time.Now().Sub(timeStart).Seconds(), syscall.Gettid(), from, score, i)
}
}
fmt.Printf("%f %6d=tid f(%s, %6d) // leave in %f; .NumCPU()=%d .GOMAXPROCS(0)=%d .NumGoroutine()=%d\n", time.Now().Sub(timeStart).Seconds(), syscall.Gettid(), from, score, time.Now().Sub(t1).Seconds(), runtime.NumCPU(), runtime.GOMAXPROCS(0), runtime.NumGoroutine())
}
func main() {
t1 := time.Now()
fmt.Printf("%f %6d=tid main() // enter; .NumCPU()=%d .GOMAXPROCS(0)=%d .NumGoroutine()=%d\n", time.Now().Sub(timeStart).Seconds(), syscall.Gettid(), runtime.NumCPU(), runtime.GOMAXPROCS(0), runtime.NumGoroutine())
if _, exists := os.LookupEnv("USE_PPROF"); exists {
fmt.Printf("%f %6d=tid main() // enabling pprof\n", time.Now().Sub(timeStart).Seconds(), syscall.Gettid())
file, _ := os.Create("pprof.out")
defer file.Close()
pprof.StartCPUProfile(file)
defer pprof.StopCPUProfile()
}
if _, exists := os.LookupEnv("USE_TRACE"); exists {
fmt.Printf("%f %6d=tid main() // enabling trace\n", time.Now().Sub(timeStart).Seconds(), syscall.Gettid())
file, _ := os.Create("trace.out")
defer file.Close()
trace.Start(file)
defer trace.Stop()
}
fmt.Printf("%f %6d=tid main() // .GOMAXPROCS(0)=%d\n", time.Now().Sub(timeStart).Seconds(), syscall.Gettid(), runtime.GOMAXPROCS(0))
goMaxProcsWanted, _ := strconv.Atoi(os.Getenv("GOMAXPROCS_WANTED"))
runtime.GOMAXPROCS(goMaxProcsWanted)
fmt.Printf("%f %6d=tid main() // .GOMAXPROCS(0)=%d\n", time.Now().Sub(timeStart).Seconds(), syscall.Gettid(), runtime.GOMAXPROCS(0))
fmt.Printf("%f %6d=tid main() // sleeping\n", time.Now().Sub(timeStart).Seconds(), syscall.Gettid())
time.Sleep(3 * time.Second)
fmt.Printf("%f %6d=tid main() // waking\n", time.Now().Sub(timeStart).Seconds(), syscall.Gettid())
f("direct", 120000)
go f("foo", 100000)
go f("bar", 80000)
go f("baz", 60000)
go func(msg string) {
fmt.Printf("%f %6d=tid func(%s)\n", time.Now().Sub(timeStart).Seconds(), syscall.Gettid(), msg)
}("going without loop")
fmt.Printf("%f %6d=tid main() // type in word + hit return\n", time.Now().Sub(timeStart).Seconds(), syscall.Gettid())
var input string
fmt.Scanln(&input)
fmt.Printf("%f %6d=tid main() // input=%s\n", time.Now().Sub(timeStart).Seconds(), syscall.Gettid(), input)
fmt.Printf("%f %6d=tid main() // leave in %f; .NumCPU()=%d .GOMAXPROCS(0)=%d .NumGoroutine()=%d\n", time.Now().Sub(timeStart).Seconds(), syscall.Gettid(), time.Now().Sub(t1).Seconds(), runtime.NumCPU(), runtime.GOMAXPROCS(0), runtime.NumGoroutine())
}
// based on https://stackoverflow.com/questions/46628053/how-go-runtime-checks-if-goroutine-is-blocked
- Set GOMAXPROCS explicitly to 3, meaning that at most 3 go-routines can run concurrently.
$ go build blocking.go && perl -e 'sleep 15; printf qq[word!\n];' | GOMAXPROCS_WANTED=3 ./blocking 2>&1 | perl -lane 'use Time::HiRes; if($_ ne $last){ if($dups > 1){ printf qq[%f x%u duplicates\n], $last_ts, $dups; } printf qq[%f %s\n], Time::HiRes::time(), $_; $dups = 1; }else{ $last_ts = Time::HiRes::time(); $dups ++; } $last=$_;' & sleep 2 ; ps -eLF | egrep "([b]locking|[P]ID)" ; sleep 4 ; ps -eLF | egrep "([b]locking|[P]ID)" ; sleep 3 ; ps -eLF | egrep "([b]locking|[P]ID)"
1622239372.823516 0.000001 6269=tid main() // enter; .NumCPU()=16 .GOMAXPROCS(0)=16 .NumGoroutine()=1
1622239372.823551 0.000073 6269=tid main() // .GOMAXPROCS(0)=16
1622239372.823555 0.000100 6269=tid main() // .GOMAXPROCS(0)=3
1622239372.823564 0.000102 6269=tid main() // sleeping
UID PID PPID LWP C NLWP SZ RSS PSR STIME TTY TIME CMD
simon_h+ 6269 6186 6269 0 5 175850 7028 6 22:02 pts/1 00:00:00 ./blocking
simon_h+ 6269 6186 6271 0 5 175850 7028 1 22:02 pts/1 00:00:00 ./blocking
simon_h+ 6269 6186 6272 0 5 175850 7028 0 22:02 pts/1 00:00:00 ./blocking
simon_h+ 6269 6186 6273 0 5 175850 7028 4 22:02 pts/1 00:00:00 ./blocking
simon_h+ 6269 6186 6274 0 5 175850 7028 13 22:02 pts/1 00:00:00 ./blocking
1622239375.820075 3.000230 6272=tid main() // waking
1622239375.820116 3.000258 6272=tid f(direct, 120000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=1
1622239376.546180 3.726323 6272=tid f(direct, 120000) // i=20000
1622239377.271699 4.451874 6272=tid f(direct, 120000) // i=40000
1622239377.983622 5.163752 6272=tid f(direct, 120000) // i=60000
1622239378.709423 5.889558 6272=tid f(direct, 120000) // i=80000
UID PID PPID LWP C NLWP SZ RSS PSR STIME TTY TIME CMD
simon_h+ 6269 6186 6269 0 5 175850 7028 6 22:02 pts/1 00:00:00 ./blocking
simon_h+ 6269 6186 6271 0 5 175850 7028 5 22:02 pts/1 00:00:00 ./blocking
simon_h+ 6269 6186 6272 50 5 175850 7028 0 22:02 pts/1 00:00:03 ./blocking
simon_h+ 6269 6186 6273 0 5 175850 7028 4 22:02 pts/1 00:00:00 ./blocking
simon_h+ 6269 6186 6274 0 5 175850 7028 13 22:02 pts/1 00:00:00 ./blocking
1622239379.432804 6.612951 6272=tid f(direct, 120000) // i=100000
1622239380.154543 7.334697 6272=tid f(direct, 120000) // i=120000
1622239380.154591 7.334737 6272=tid f(direct, 120000) // leave in 4.334479; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=1
1622239380.154597 7.334769 6272=tid main() // type in word + hit return
1622239380.154689 7.334821 6269=tid f(bar, 80000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=5
1622239380.154722 7.334852 6274=tid f(foo, 100000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=5
1622239380.156020 7.336200 6273=tid func(going without loop)
1622239380.156043 7.336230 6273=tid f(baz, 60000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=4
1622239380.515295 7.695442 6273=tid f(baz, 60000) // i=20000
1622239380.631283 7.811423 6274=tid f(bar, 80000) // i=20000
1622239380.750385 7.930509 6269=tid f(foo, 100000) // i=20000
1622239380.880283 8.059341 6269=tid f(baz, 60000) // i=40000
1622239381.121222 8.301370 6273=tid f(bar, 80000) // i=40000
1622239381.240409 8.420556 6269=tid f(baz, 60000) // i=60000
1622239381.240453 8.420593 6269=tid f(baz, 60000) // leave in 1.084364; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=4
1622239381.357653 8.537805 6273=tid f(foo, 100000) // i=40000
1622239381.597033 8.777167 6273=tid f(bar, 80000) // i=60000
UID PID PPID LWP C NLWP SZ RSS PSR STIME TTY TIME CMD
simon_h+ 6269 6186 6269 12 5 175850 7028 6 22:02 pts/1 00:00:01 ./blocking
simon_h+ 6269 6186 6271 0 5 175850 7028 9 22:02 pts/1 00:00:00 ./blocking
simon_h+ 6269 6186 6272 48 5 175850 7028 0 22:02 pts/1 00:00:04 ./blocking
simon_h+ 6269 6186 6273 19 5 175850 7028 7 22:02 pts/1 00:00:01 ./blocking
simon_h+ 6269 6186 6274 19 5 175850 7028 13 22:02 pts/1 00:00:01 ./blocking
1622239381.965264 9.145415 6273=tid f(foo, 100000) // i=60000
1622239382.092083 9.272224 6273=tid f(bar, 80000) // i=80000
1622239382.092124 9.272259 6273=tid f(bar, 80000) // leave in 1.937439; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=3
1622239382.560368 9.740509 6274=tid f(foo, 100000) // i=80000
1622239383.169018 10.349182 6274=tid f(foo, 100000) // i=100000
1622239383.169062 10.349215 6274=tid f(foo, 100000) // leave in 3.014363; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=2
1622239387.819274 14.999441 6272=tid main() // input=word!
1622239387.819318 14.999461 6272=tid main() // leave in 14.999461; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=1
- Observation: Even though we set GOMAXPROCS to 3, `ps` always reports 5 LWPs for the process. Why?
- Set GOMAXPROCS explicitly to 1, meaning that at most 1 go-routine can run at a time.
- Observation: Even though we set GOMAXPROCS to 3, the last `ps` of the previous run shows 4x threads chewing CPU. Why?
$ go build blocking.go && perl -e 'sleep 15; printf qq[word!\n];' | GOMAXPROCS_WANTED=1 ./blocking 2>&1 | perl -lane 'use Time::HiRes; if($_ ne $last){ if($dups > 1){ printf qq[%f x%u duplicates\n], $last_ts, $dups; } printf qq[%f %s\n], Time::HiRes::time(), $_; $dups = 1; }else{ $last_ts = Time::HiRes::time(); $dups ++; } $last=$_;' & sleep 2 ; ps -eLF | egrep "([b]locking|[P]ID)" ; sleep 4 ; ps -eLF | egrep "([b]locking|[P]ID)" ; sleep 3 ; ps -eLF | egrep "([b]locking|[P]ID)"
1622239518.800515 0.000001 6356=tid main() // enter; .NumCPU()=16 .GOMAXPROCS(0)=16 .NumGoroutine()=1
1622239518.800548 0.000037 6356=tid main() // .GOMAXPROCS(0)=16
1622239518.800557 0.000072 6356=tid main() // .GOMAXPROCS(0)=1
1622239518.800585 0.000076 6356=tid main() // sleeping
UID PID PPID LWP C NLWP SZ RSS PSR STIME TTY TIME CMD
simon_h+ 6356 6283 6356 0 5 175850 5788 9 22:05 pts/1 00:00:00 ./blocking
simon_h+ 6356 6283 6358 0 5 175850 5788 15 22:05 pts/1 00:00:00 ./blocking
simon_h+ 6356 6283 6359 0 5 175850 5788 10 22:05 pts/1 00:00:00 ./blocking
simon_h+ 6356 6283 6360 0 5 175850 5788 11 22:05 pts/1 00:00:00 ./blocking
simon_h+ 6356 6283 6361 0 5 175850 5788 6 22:05 pts/1 00:00:00 ./blocking
1622239521.796662 3.000204 6359=tid main() // waking
1622239521.796703 3.000229 6359=tid f(direct, 120000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=1
1622239522.521093 3.724637 6359=tid f(direct, 120000) // i=20000
1622239523.235430 4.438922 6359=tid f(direct, 120000) // i=40000
1622239523.958976 5.162500 6359=tid f(direct, 120000) // i=60000
1622239524.683917 5.887451 6359=tid f(direct, 120000) // i=80000
UID PID PPID LWP C NLWP SZ RSS PSR STIME TTY TIME CMD
simon_h+ 6356 6283 6356 0 5 175850 5788 11 22:05 pts/1 00:00:00 ./blocking
simon_h+ 6356 6283 6358 0 5 175850 5788 6 22:05 pts/1 00:00:00 ./blocking
simon_h+ 6356 6283 6359 49 5 175850 5788 9 22:05 pts/1 00:00:02 ./blocking
simon_h+ 6356 6283 6360 0 5 175850 5788 11 22:05 pts/1 00:00:00 ./blocking
simon_h+ 6356 6283 6361 0 5 175850 5788 6 22:05 pts/1 00:00:00 ./blocking
1622239525.407411 6.610946 6359=tid f(direct, 120000) // i=100000
1622239526.119309 7.322831 6359=tid f(direct, 120000) // i=120000
1622239526.119351 7.322869 6359=tid f(direct, 120000) // leave in 4.322641; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=1
1622239526.119359 7.322890 6359=tid main() // type in word + hit return
1622239526.130854 7.334395 6356=tid func(going without loop)
1622239526.136426 7.334472 6361=tid f(foo, 100000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=5
1622239526.146608 7.334523 6361=tid f(bar, 80000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=5
1622239526.146673 7.334536 6361=tid f(baz, 60000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=5
1622239527.226219 8.429745 6361=tid f(baz, 60000) // i=20000
1622239527.571185 8.774705 6361=tid f(bar, 80000) // i=20000
UID PID PPID LWP C NLWP SZ RSS PSR STIME TTY TIME CMD
simon_h+ 6356 6283 6356 0 5 175850 5788 7 22:05 pts/1 00:00:00 ./blocking
simon_h+ 6356 6283 6358 0 5 175850 5788 12 22:05 pts/1 00:00:00 ./blocking
simon_h+ 6356 6283 6359 48 5 175850 5788 9 22:05 pts/1 00:00:04 ./blocking
simon_h+ 6356 6283 6360 0 5 175850 5788 11 22:05 pts/1 00:00:00 ./blocking
simon_h+ 6356 6283 6361 18 5 175850 5788 6 22:05 pts/1 00:00:01 ./blocking
1622239527.908274 9.111738 6361=tid f(foo, 100000) // i=20000
1622239528.311557 9.515085 6361=tid f(baz, 60000) // i=40000
1622239529.021321 10.224857 6361=tid f(bar, 80000) // i=40000
1622239529.398300 10.601836 6361=tid f(baz, 60000) // i=60000
1622239529.398340 10.601869 6361=tid f(baz, 60000) // leave in 3.267334; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=4
1622239529.608226 10.811770 6361=tid f(foo, 100000) // i=40000
1622239530.115676 11.319215 6361=tid f(bar, 80000) // i=60000
1622239530.796553 12.000074 6361=tid f(foo, 100000) // i=60000
1622239531.101278 12.304796 6361=tid f(bar, 80000) // i=80000
1622239531.101320 12.304834 6361=tid f(bar, 80000) // leave in 4.970311; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=3
1622239531.540955 12.744472 6361=tid f(foo, 100000) // i=80000
1622239532.152082 13.355598 6361=tid f(foo, 100000) // i=100000
1622239532.152125 13.355641 6361=tid f(foo, 100000) // leave in 6.021169; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=2
1622239533.796099 14.999647 6359=tid main() // input=word!
1622239533.796149 14.999667 6359=tid main() // leave in 14.999667; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=1
- Observation: Even though we set GOMAXPROCS to 1, `ps` always reports 5 LWPs for the process. Why?
- Observation: Even though we set GOMAXPROCS to 1, the last `ps` shows 2x threads chewing CPU. Why?
- Note: Total CPU usage does not go above 100% though. Maybe Golang switches between LWPs but does not use them concurrently?
- Observation: How do the non-cooperative `foo`, `bar`, and `baz` run 'concurrently' if GOMAXPROCS is 1?
- It turns out that Go version 1.14 introduced asynchronous preemption:
Goroutines are now asynchronously preemptible. As a result, loops without function calls no longer potentially deadlock the scheduler or significantly delay garbage collection. This is supported on all platforms except `windows/arm`, `darwin/arm`, `js/wasm`, and `plan9/*`.
- The original Golang asynchronous preemption proposal is described here [1]. And talked about here [2].
[1] https://go.googlesource.com/proposal/+/master/design/24543-non-cooperative-preemption.md [2] https://stackoverflow.com/questions/64113394/at-which-point-a-goroutine-can-yield
- Set GOMAXPROCS explicitly to 1, meaning that at most 1 go-routine can run at a time.
- Golang uses SIGURG to preempt functions at various points in execution, and these signals can be viewed by running the above code under `strace`:
$ go build blocking.go && perl -e 'sleep 15; printf qq[word!\n];' | GOMAXPROCS_WANTED=1 strace -f -e 'trace=!write,gettid,getpid,futex,tgkill,arch_prctl,sigaltstack,rt_sigprocmask,rt_sigaction,rt_sigreturn,nanosleep,epoll_pwait' ./blocking 2>&1 | perl -lane 'use Time::HiRes; if($_ ne $last){ if($dups > 1){ printf qq[%f x%u duplicates\n], $last_ts, $dups; } printf qq[%f %s\n], Time::HiRes::time(), $_; $dups = 1; }else{ $last_ts = Time::HiRes::time(); $dups ++; } $last=$_;' & sleep 2 ; ps -eLF | egrep "([b]locking|[P]ID)" ; sleep 4 ; ps -eLF | egrep "([b]locking|[P]ID)" ; sleep 3 ; ps -eLF | egrep "([b]locking|[P]ID)"
1622239615.195494 execve("./blocking", ["./blocking"], 0x7ffcd316c658 /* 24 vars */) = 0
1622239615.195529 sched_getaffinity(0, 8192, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) = 64
1622239615.195544 openat(AT_FDCWD, "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", O_RDONLY) = 3
1622239615.195555 read(3, "2097152\n", 20) = 8
1622239615.195560 close(3) = 0
1622239615.195568 uname({sysname="Linux", nodename="benchnet-dev-002", ...}) = 0
1622239615.195575 mmap(NULL, 262144, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f6f462f8000
1622239615.195581 mmap(NULL, 131072, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f6f462d8000
1622239615.195587 mmap(NULL, 1048576, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f6f461d8000
1622239615.195593 mmap(NULL, 8388608, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f6f459d8000
1622239615.195599 mmap(NULL, 67108864, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f6f419d8000
1622239615.195605 mmap(NULL, 536870912, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f6f219d8000
1622239615.195611 mmap(0xc000000000, 67108864, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0xc000000000
1622239615.195617 mmap(0xc000000000, 67108864, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0xc000000000
1622239615.195624 mmap(NULL, 33554432, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f6f1f9d8000
1622239615.195633 mmap(NULL, 2165768, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f6f1f7c7000
1622239615.195642 mmap(0x7f6f462d8000, 131072, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f6f462d8000
1622239615.195651 mmap(0x7f6f46258000, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f6f46258000
1622239615.195660 mmap(0x7f6f45dde000, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f6f45dde000
1622239615.195670 mmap(0x7f6f43a08000, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f6f43a08000
1622239615.195680 mmap(0x7f6f31b58000, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f6f31b58000
1622239615.195689 mmap(NULL, 1048576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f6f1f6c7000
1622239615.195699 mmap(NULL, 65536, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f6f1f6b7000
1622239615.195708 mmap(NULL, 65536, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f6f1f6a7000
1622239615.195717 clone(strace: Process 6459 attached
1622239615.195725 child_stack=0xc000062000, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM) = 6459
1622239615.195734 [pid 6458] clone(strace: Process 6460 attached
1622239615.195742 child_stack=0xc000064000, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM) = 6460
1622239615.195750 [pid 6458] clone(child_stack=0xc00005e000, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM) = 6461
1622239615.195759 strace: Process 6461 attached
1622239615.195770 [pid 6460] mmap(NULL, 262144, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f6f1f667000
1622239615.195778 [pid 6460] clone(strace: Process 6462 attached
1622239615.195786 child_stack=0xc000092000, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM) = 6462
1622239615.195795 [pid 6458] clone(child_stack=0xc000060000, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM) = 6463
1622239615.195802 strace: Process 6463 attached
1622239615.195897 [pid 6458] readlinkat(AT_FDCWD, "/proc/self/exe", "/home/simon_hardy_francis_dapper"..., 128) = 72
1622239615.195925 [pid 6458] fcntl(0, F_GETFL) = 0 (flags O_RDONLY)
1622239615.195986 [pid 6458] fcntl(1, F_GETFL) = 0x1 (flags O_WRONLY)
1622239615.196013 [pid 6458] fcntl(2, F_GETFL) = 0x1 (flags O_WRONLY)
1622239615.196169 0.000001 6458=tid main() // enter; .NumCPU()=16 .GOMAXPROCS(0)=16 .NumGoroutine()=1
1622239615.196212 0.000090 6458=tid main() // .GOMAXPROCS(0)=16
1622239615.196291 [pid 6458] mmap(NULL, 262144, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0 <unfinished ...>
1622239615.196326 [pid 6459] sched_yield( <unfinished ...>
1622239615.196336 [pid 6458] <... mmap resumed> ) = 0x7f6f1f627000
1622239615.196347 [pid 6459] <... sched_yield resumed> ) = 0
1622239615.196523 0.000400 6458=tid main() // .GOMAXPROCS(0)=1
1622239615.196623 0.000501 6462=tid main() // sleeping
1622239615.196664 [pid 6462] epoll_create1(EPOLL_CLOEXEC) = 3
1622239615.196705 [pid 6462] pipe2([4, 5], O_NONBLOCK|O_CLOEXEC) = 0
1622239615.196757 [pid 6462] epoll_ctl(3, EPOLL_CTL_ADD, 4, {EPOLLIN, {u32=5962384, u64=5962384}}) = 0
UID PID PPID LWP C NLWP SZ RSS PSR STIME TTY TIME CMD
simon_h+ 6454 6370 6454 0 1 1565 3320 10 22:06 pts/1 00:00:00 strace -f -e trace=!write,gettid,getpid,futex,tgkill,arch_prctl,sigaltstack,rt_sigprocmask,rt_sigaction,rt_sigreturn,nanosleep,epoll_pwait ./blocking
simon_h+ 6458 6454 6458 0 6 175850 5852 6 22:06 pts/1 00:00:00 ./blocking
simon_h+ 6458 6454 6459 0 6 175850 5852 9 22:06 pts/1 00:00:00 ./blocking
simon_h+ 6458 6454 6460 0 6 175850 5852 7 22:06 pts/1 00:00:00 ./blocking
simon_h+ 6458 6454 6461 0 6 175850 5852 0 22:06 pts/1 00:00:00 ./blocking
simon_h+ 6458 6454 6462 0 6 175850 5852 3 22:06 pts/1 00:00:00 ./blocking
simon_h+ 6458 6454 6463 0 6 175850 5852 5 22:06 pts/1 00:00:00 ./blocking
1622239618.197036 3.000820 6458=tid main() // waking
1622239618.197123 3.000954 6463=tid f(direct, 120000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=1
1622239618.208377 [pid 6463] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=6458, si_uid=1054159318} ---
1622239618.919412 x36 duplicates
1622239618.922672 3.726441 6463=tid f(direct, 120000) // i=20000
1622239618.939843 [pid 6463] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=6458, si_uid=1054159318} ---
1622239619.635284 x35 duplicates
1622239619.636707 4.440469 6463=tid f(direct, 120000) // i=40000
1622239619.655789 [pid 6463] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=6458, si_uid=1054159318} ---
1622239620.351033 x35 duplicates
1622239620.359982 5.163726 6463=tid f(direct, 120000) // i=60000
1622239620.371477 [pid 6463] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=6458, si_uid=1054159318} ---
1622239621.067251 x35 duplicates
1622239621.084177 5.887915 6463=tid f(direct, 120000) // i=80000
1622239621.087738 [pid 6463] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=6458, si_uid=1054159318} ---
UID PID PPID LWP C NLWP SZ RSS PSR STIME TTY TIME CMD
simon_h+ 6454 6370 6454 0 1 1565 3320 3 22:06 pts/1 00:00:00 strace -f -e trace=!write,gettid,getpid,futex,tgkill,arch_prctl,sigaltstack,rt_sigprocmask,rt_sigaction,rt_sigreturn,nanosleep,epoll_pwait ./blocking
simon_h+ 6458 6454 6458 0 6 175850 5852 6 22:06 pts/1 00:00:00 ./blocking
simon_h+ 6458 6454 6459 0 6 175850 5852 0 22:06 pts/1 00:00:00 ./blocking
simon_h+ 6458 6454 6460 0 6 175850 5852 7 22:06 pts/1 00:00:00 ./blocking
simon_h+ 6458 6454 6461 0 6 175850 5852 0 22:06 pts/1 00:00:00 ./blocking
simon_h+ 6458 6454 6462 0 6 175850 5852 3 22:06 pts/1 00:00:00 ./blocking
simon_h+ 6458 6454 6463 49 6 175850 5852 5 22:06 pts/1 00:00:02 ./blocking
1622239621.803211 x36 duplicates
1622239621.807145 6.610914 6463=tid f(direct, 120000) // i=100000
1622239621.823699 [pid 6463] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=6458, si_uid=1054159318} ---
1622239622.520434 x35 duplicates
1622239622.522924 7.326583 6463=tid f(direct, 120000) // i=120000
1622239622.522990 7.326868 6463=tid f(direct, 120000) // leave in 4.325926; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=1
1622239622.523041 7.326937 6463=tid main() // type in word + hit return
1622239622.540928 [pid 6463] read(0, 7.344752 6462=tid func(going without loop)
1622239622.540969 7.344825 6462=tid f(foo, 100000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=4
1622239622.562307 <unfinished ...>
1622239622.562346 [pid 6462] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=6458, si_uid=1054159318} ---
1622239622.562451 7.366261 6462=tid f(bar, 80000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=4
1622239622.577875 [pid 6462] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=6458, si_uid=1054159318} ---
1622239622.578027 7.381881 6462=tid f(baz, 60000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=4
1622239622.598332 [pid 6462] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=6458, si_uid=1054159318} ---
1622239623.643098 x52 duplicates
1622239623.659040 8.462803 6462=tid f(baz, 60000) // i=20000
1622239623.663517 [pid 6462] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=6458, si_uid=1054159318} ---
1622239623.991278 x17 duplicates
1622239624.007362 8.811113 6462=tid f(bar, 80000) // i=20000
1622239624.011778 [pid 6462] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=6458, si_uid=1054159318} ---
UID PID PPID LWP C NLWP SZ RSS PSR STIME TTY TIME CMD
simon_h+ 6454 6370 6454 0 1 1565 3320 5 22:06 pts/1 00:00:00 strace -f -e trace=!write,gettid,getpid,futex,tgkill,arch_prctl,sigaltstack,rt_sigprocmask,rt_sigaction,rt_sigreturn,nanosleep,epoll_pwait ./blocking
simon_h+ 6458 6454 6458 0 6 175850 5852 6 22:06 pts/1 00:00:00 ./blocking
simon_h+ 6458 6454 6459 0 6 175850 5852 4 22:06 pts/1 00:00:00 ./blocking
simon_h+ 6458 6454 6460 0 6 175850 5852 7 22:06 pts/1 00:00:00 ./blocking
simon_h+ 6458 6454 6461 0 6 175850 5852 0 22:06 pts/1 00:00:00 ./blocking
simon_h+ 6458 6454 6462 18 6 175850 5852 2 22:06 pts/1 00:00:01 ./blocking
simon_h+ 6458 6454 6463 47 6 175850 5852 8 22:06 pts/1 00:00:04 ./blocking
1622239624.278122 x14 duplicates
1622239624.284524 9.088293 6462=tid f(foo, 100000) // i=20000
1622239624.298615 [pid 6462] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=6458, si_uid=1054159318} ---
1622239624.769219 x24 duplicates
1622239624.781013 9.584756 6462=tid f(baz, 60000) // i=40000
1622239624.789645 [pid 6462] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=6458, si_uid=1054159318} ---
1622239625.485786 x35 duplicates
1622239625.494236 10.297982 6462=tid f(bar, 80000) // i=40000
1622239625.506155 [pid 6462] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=6458, si_uid=1054159318} ---
1622239625.894760 x20 duplicates
1622239625.898915 10.702669 6462=tid f(baz, 60000) // i=60000
1622239625.898934 10.702797 6462=tid f(baz, 60000) // leave in 3.320944; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=4
1622239625.915290 [pid 6462] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=6458, si_uid=1054159318} ---
1622239625.976798 x4 duplicates
1622239625.978271 10.782071 6462=tid f(foo, 100000) // i=40000
1622239625.997289 [pid 6462] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=6458, si_uid=1054159318} ---
1622239626.570206 x29 duplicates
1622239626.588907 11.392650 6462=tid f(bar, 80000) // i=60000
1622239626.590635 [pid 6462] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=6458, si_uid=1054159318} ---
1622239627.163053 x29 duplicates
1622239627.173564 11.977313 6462=tid f(foo, 100000) // i=60000
1622239627.183497 [pid 6462] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=6458, si_uid=1054159318} ---
1622239627.552142 x19 duplicates
1622239627.562964 12.366691 6462=tid f(bar, 80000) // i=80000
1622239627.562975 12.366820 6462=tid f(bar, 80000) // leave in 5.000588; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=3
1622239627.582762 [pid 6462] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=6458, si_uid=1054159318} ---
1622239627.951767 x19 duplicates
1622239627.968117 12.771858 6462=tid f(foo, 100000) // i=80000
1622239627.972193 [pid 6462] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=6458, si_uid=1054159318} ---
1622239628.566548 x30 duplicates
1622239628.577700 13.381432 6462=tid f(foo, 100000) // i=100000
1622239628.577750 13.381596 6462=tid f(foo, 100000) // leave in 6.036797; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=2
1622239630.189976 [pid 6463] <... read resumed> "w", 1) = 1
1622239630.190178 [pid 6463] read(0, "o", 1) = 1
1622239630.190212 [pid 6463] read(0, "r", 1) = 1
1622239630.190339 [pid 6462] read(0, "d", 1) = 1
1622239630.190381 [pid 6462] read(0, "!", 1) = 1
1622239630.190423 [pid 6462] read(0, "\n", 1) = 1
1622239630.190500 14.994351 6462=tid main() // input=word!
1622239630.190565 14.994430 6462=tid main() // leave in 14.994457; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=1
1622239630.190665 [pid 6462] exit_group(0) = ?
1622239630.190714 [pid 6463] +++ exited with 0 +++
1622239630.190763 [pid 6462] +++ exited with 0 +++
1622239630.190846 [pid 6461] +++ exited with 0 +++
1622239630.190853 [pid 6460] +++ exited with 0 +++
1622239630.190982 [pid 6459] +++ exited with 0 +++
1622239630.190991 +++ exited with 0 +++
- Re-running with
GODEBUG=asyncpreemptoff=1
[1] to switch off async preemption:
$ go build blocking.go && perl -e 'sleep 15; printf qq[word!\n];' | GODEBUG=asyncpreemptoff=1 GOMAXPROCS_WANTED=1 strace -f -e 'trace=!write,gettid,getpid,futex,tgkill,arch_prctl,sigaltstack,rt_sigprocmask,rt_sigaction,rt_sigreturn,nanosleep,epoll_pwait' ./blocking 2>&1 | perl -lane 'use Time::HiRes; if($_ ne $last){ if($dups > 1){ printf qq[%f x%u duplicates\n], $last_ts, $dups; } printf qq[%f %s\n], Time::HiRes::time(), $_; $dups = 1; }else{ $last_ts = Time::HiRes::time(); $dups ++; } $last=$_;' & sleep 2 ; ps -eLF | egrep "([b]locking|[P]ID)" ; sleep 4 ; ps -eLF | egrep "([b]locking|[P]ID)" ; sleep 3 ; ps -eLF | egrep "([b]locking|[P]ID)"
1622239745.443929 execve("./blocking", ["./blocking"], 0x7fffc7bd8f28 /* 25 vars */) = 0
1622239745.443965 sched_getaffinity(0, 8192, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) = 64
1622239745.443970 openat(AT_FDCWD, "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", O_RDONLY) = 3
1622239745.443973 read(3, "2097152\n", 20) = 8
1622239745.443977 close(3) = 0
1622239745.443986 uname({sysname="Linux", nodename="benchnet-dev-002", ...}) = 0
1622239745.443991 mmap(NULL, 262144, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa4eb02c000
1622239745.444003 mmap(NULL, 131072, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa4eb00c000
1622239745.444034 mmap(NULL, 1048576, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa4eaf0c000
1622239745.444044 mmap(NULL, 8388608, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa4ea70c000
1622239745.444053 mmap(NULL, 67108864, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa4e670c000
1622239745.444063 mmap(NULL, 536870912, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa4c670c000
1622239745.444070 mmap(0xc000000000, 67108864, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0xc000000000
1622239745.444078 mmap(0xc000000000, 67108864, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0xc000000000
1622239745.444087 mmap(NULL, 33554432, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa4c470c000
1622239745.444095 mmap(NULL, 2165768, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa4c44fb000
1622239745.444103 mmap(0x7fa4eb00c000, 131072, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7fa4eb00c000
1622239745.444113 mmap(0x7fa4eaf8c000, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7fa4eaf8c000
1622239745.444123 mmap(0x7fa4eab12000, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7fa4eab12000
1622239745.444133 mmap(0x7fa4e873c000, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7fa4e873c000
1622239745.444143 mmap(0x7fa4d688c000, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7fa4d688c000
1622239745.444152 mmap(NULL, 1048576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa4c43fb000
1622239745.444162 mmap(NULL, 65536, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa4c43eb000
1622239745.444172 mmap(NULL, 65536, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa4c43db000
1622239745.444185 clone(strace: Process 6553 attached
1622239745.444200 child_stack=0xc000064000, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM) = 6553
1622239745.444263 [pid 6552] clone(child_stack=0xc000066000, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM) = 6554
1622239745.444276 strace: Process 6554 attached
1622239745.444382 [pid 6552] clone(strace: Process 6555 attached
1622239745.444404 child_stack=0xc000060000, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM) = 6555
1622239745.444533 [pid 6554] mmap(NULL, 262144, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa4c439b000
1622239745.444637 [pid 6554] clone( <unfinished ...>
1622239745.444663 [pid 6552] readlinkat(AT_FDCWD, "/proc/self/exe", <unfinished ...>
1622239745.444675 [pid 6554] <... clone resumed> child_stack=0xc000092000, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM) = 6556
1622239745.444686 strace: Process 6556 attached
1622239745.444705 [pid 6552] <... readlinkat resumed> "/home/simon_hardy_francis_dapper"..., 128) = 72
1622239745.444726 [pid 6552] fcntl(0, F_GETFL) = 0 (flags O_RDONLY)
1622239745.444757 [pid 6552] mmap(NULL, 262144, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa4c435b000
1622239745.444784 [pid 6552] fcntl(1, F_GETFL) = 0x1 (flags O_WRONLY)
1622239745.444803 [pid 6552] fcntl(2, F_GETFL) = 0x1 (flags O_WRONLY)
1622239745.444958 0.000001 6552=tid main() // enter; .NumCPU()=16 .GOMAXPROCS(0)=16 .NumGoroutine()=1
1622239745.444994 0.000079 6552=tid main() // .GOMAXPROCS(0)=16
1622239745.445106 0.000162 6552=tid main() // .GOMAXPROCS(0)=1
1622239745.445147 0.000234 6552=tid main() // sleeping
1622239745.445186 [pid 6552] epoll_create1(EPOLL_CLOEXEC) = 3
1622239745.445234 [pid 6552] pipe2([4, 5], O_NONBLOCK|O_CLOEXEC) = 0
1622239745.445254 [pid 6552] epoll_ctl(3, EPOLL_CTL_ADD, 4, {EPOLLIN, {u32=5962384, u64=5962384}}) = 0
UID PID PPID LWP C NLWP SZ RSS PSR STIME TTY TIME CMD
simon_h+ 6548 6475 6548 0 1 1565 3428 6 22:09 pts/1 00:00:00 strace -f -e trace=!write,gettid,getpid,futex,tgkill,arch_prctl,sigaltstack,rt_sigprocmask,rt_sigaction,rt_sigreturn,nanosleep,epoll_pwait ./blocking
simon_h+ 6552 6548 6552 0 5 175850 7028 7 22:09 pts/1 00:00:00 ./blocking
simon_h+ 6552 6548 6553 0 5 175850 7028 1 22:09 pts/1 00:00:00 ./blocking
simon_h+ 6552 6548 6554 0 5 175850 7028 4 22:09 pts/1 00:00:00 ./blocking
simon_h+ 6552 6548 6555 0 5 175850 7028 11 22:09 pts/1 00:00:00 ./blocking
simon_h+ 6552 6548 6556 0 5 175850 7028 8 22:09 pts/1 00:00:00 ./blocking
1622239748.445595 3.000587 6554=tid main() // waking
1622239748.445705 3.000727 6556=tid f(direct, 120000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=1
1622239749.171289 3.726181 6556=tid f(direct, 120000) // i=20000
1622239749.894626 4.449571 6556=tid f(direct, 120000) // i=40000
1622239750.620029 5.175008 6556=tid f(direct, 120000) // i=60000
1622239751.334014 5.888957 6556=tid f(direct, 120000) // i=80000
UID PID PPID LWP C NLWP SZ RSS PSR STIME TTY TIME CMD
simon_h+ 6548 6475 6548 0 1 1565 3428 14 22:09 pts/1 00:00:00 strace -f -e trace=!write,gettid,getpid,futex,tgkill,arch_prctl,sigaltstack,rt_sigprocmask,rt_sigaction,rt_sigreturn,nanosleep,epoll_pwait ./blocking
simon_h+ 6552 6548 6552 0 5 175850 7028 7 22:09 pts/1 00:00:00 ./blocking
simon_h+ 6552 6548 6553 0 5 175850 7028 1 22:09 pts/1 00:00:00 ./blocking
simon_h+ 6552 6548 6554 0 5 175850 7028 4 22:09 pts/1 00:00:00 ./blocking
simon_h+ 6552 6548 6555 0 5 175850 7028 11 22:09 pts/1 00:00:00 ./blocking
simon_h+ 6552 6548 6556 49 5 175850 7028 8 22:09 pts/1 00:00:02 ./blocking
1622239752.058552 6.613503 6556=tid f(direct, 120000) // i=100000
1622239752.780142 7.335125 6556=tid f(direct, 120000) // i=120000
1622239752.780180 7.335231 6556=tid f(direct, 120000) // leave in 4.334531; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=1
1622239752.780315 7.335344 6556=tid main() // type in word + hit return
1622239752.799064 [pid 6556] read(0, 7.354051 6552=tid func(going without loop)
1622239752.799373 7.354321 6554=tid f(foo, 100000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=5
1622239753.410394 7.354389 6554=tid f(bar, 80000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=5
1622239753.885670 7.354435 6554=tid f(baz, 60000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=5
1622239754.250720 8.805672 6554=tid f(foo, 100000) // i=20000
UID PID PPID LWP C NLWP SZ RSS PSR STIME TTY TIME CMD
simon_h+ 6548 6475 6548 0 1 1565 3428 8 22:09 pts/1 00:00:00 strace -f -e trace=!write,gettid,getpid,futex,tgkill,arch_prctl,sigaltstack,rt_sigprocmask,rt_sigaction,rt_sigreturn,nanosleep,epoll_pwait ./blocking
simon_h+ 6552 6548 6552 0 5 175850 7028 7 22:09 pts/1 00:00:00 ./blocking
simon_h+ 6552 6548 6553 0 5 175850 7028 5 22:09 pts/1 00:00:00 ./blocking
simon_h+ 6552 6548 6554 18 5 175850 7028 4 22:09 pts/1 00:00:01 ./blocking
simon_h+ 6552 6548 6555 0 5 175850 7028 11 22:09 pts/1 00:00:00 ./blocking
simon_h+ 6552 6548 6556 48 5 175850 7028 8 22:09 pts/1 00:00:04 ./blocking
simon_hardy_francis_dapperlabs_c@benchnet-dev-002:~/20210526-golang-examples$ 1622239754.845221 9.400169 6554=tid f(bar, 80000) // i=20000
1622239755.331816 9.886732 6554=tid f(baz, 60000) // i=20000
1622239755.689428 10.244423 6554=tid f(foo, 100000) // i=40000
1622239756.291424 10.846358 6554=tid f(bar, 80000) // i=40000
1622239756.779627 11.334555 6554=tid f(baz, 60000) // i=40000
1622239757.145514 11.700461 6554=tid f(foo, 100000) // i=60000
1622239757.739752 12.294703 6554=tid f(bar, 80000) // i=60000
1622239758.225962 12.780897 6554=tid f(baz, 60000) // i=60000
1622239758.226009 12.781046 6554=tid f(baz, 60000) // leave in 5.426639; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=4
1622239758.226070 12.781111 6554=tid f(foo, 100000) // i=80000
1622239758.828281 13.383283 6554=tid f(bar, 80000) // i=80000
1622239758.828323 13.383381 6554=tid f(bar, 80000) // leave in 6.029004; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=3
1622239758.828331 13.383416 6554=tid f(foo, 100000) // i=100000
1622239758.828372 13.383450 6554=tid f(foo, 100000) // leave in 6.029141; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=2
1622239760.439469 "w", 1) = 1
1622239760.439634 [pid 6556] read(0, "o", 1) = 1
1622239760.439667 [pid 6556] read(0, "r", 1) = 1
1622239760.439704 [pid 6556] read(0, "d", 1) = 1
1622239760.439790 [pid 6556] read(0, "!", 1) = 1
1622239760.439846 [pid 6556] read(0, "\n", 1) = 1
1622239760.439933 14.994994 6556=tid main() // input=word!
1622239760.439995 14.995058 6556=tid main() // leave in 14.995086; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=1
1622239760.440053 [pid 6556] exit_group(0) = ?
1622239760.440111 [pid 6556] +++ exited with 0 +++
1622239760.440121 [pid 6555] +++ exited with 0 +++
1622239760.440131 [pid 6554] +++ exited with 0 +++
1622239760.440340 [pid 6553] +++ exited with 0 +++
1622239760.440354 +++ exited with 0 +++
- Observation: As expected, `strace` no longer shows the `SIGURG` signals used for async preemption.
- Observation: Even without async preemption, the go-routines still appear to context switch. Presumably this happens due to the `fmt.Printf()`?
[1] https://medium.com/a-journey-with-go/go-asynchronous-preemption-b5194227371c
# Experiment 1: Run on Linux 16 core box with `GOMAXPROCS=3`
1622239381.240453 8.420593 6269=tid f(baz, 60000) // leave in 1.084364; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=4
1622239382.092124 9.272259 6273=tid f(bar, 80000) // leave in 1.937439; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=3
1622239383.169062 10.349215 6274=tid f(foo, 100000) // leave in 3.014363; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=2
# Experiment 2: Run on Linux 16 core box with `GOMAXPROCS=1`
1622239529.398340 10.601869 6361=tid f(baz, 60000) // leave in 3.267334; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=4
1622239531.101320 12.304834 6361=tid f(bar, 80000) // leave in 4.970311; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=3
1622239532.152125 13.355641 6361=tid f(foo, 100000) // leave in 6.021169; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=2
# Experiment 3: Run on Linux 16 core box with `GOMAXPROCS=1` & `strace`
1622239625.898934 10.702797 6462=tid f(baz, 60000) // leave in 3.320944; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=4
1622239627.562975 12.366820 6462=tid f(bar, 80000) // leave in 5.000588; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=3
1622239628.577750 13.381596 6462=tid f(foo, 100000) // leave in 6.036797; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=2
# Experiment 4: Run on Linux 16 core box with `GOMAXPROCS=1` & `strace` & `GODEBUG=asyncpreemptoff=1`
1622239758.226009 12.781046 6554=tid f(baz, 60000) // leave in 5.426639; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=4
1622239758.828323 13.383381 6554=tid f(bar, 80000) // leave in 6.029004; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=3
1622239758.828372 13.383450 6554=tid f(foo, 100000) // leave in 6.029141; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=2
- Observation: Experiments 2 & 3 have the same timings, and are the same experiments except for `strace`.
- Observation: Experiment 1 shows the fastest execution time for each go-routine because they each have their own LWP to run on, with no switching to other go-routines or async preemption.
- Observation: Experiment 1 shows the go-routines running in parallel, but their total time is approx 1 + 2 + 3 = 6 seconds, which is -- as expected -- the time of the slowest go-routine if `GOMAXPROCS=1`.
- Observation: Likewise, Experiment 1 is -- as expected -- 3 seconds faster finishing the slowest go-routine due to running the go-routines in parallel on different LWPs.
- Observation: Experiment 4 shows the most distorted / misleading wall-clock times -- only ~ half a second between the fastest and slowest go-routines -- and is not proportionate to the workload of each go-routine.
- Observation: Experiments 2 & 3 show misleading wall-clock times but due to async preemption the times are at least proportionate to the workload of each go-routine.
- Question: The experiments measure the elapsed wall-clock time which ends up different depending upon how much switching goes on between go-routines on a particular LWP. For experiments 2 to 4, how can the code above be modified to record additionally the wall-clock time spent for the particular function excluding time used in other functions (not sure if a facility like this exists for Golang?), e.g. the following desired output if the code changes existed:
# Experiment 3: Run on Linux 16 core box with `GOMAXPROCS=1` & `strace`
1622239625.898934 10.702797 6462=tid f(baz, 60000) // leave in 3.320944, 1.084364 used; ...
1622239627.562975 12.366820 6462=tid f(bar, 80000) // leave in 5.000588, 1.937439 used;
1622239628.577750 13.381596 6462=tid f(foo, 100000) // leave in 6.036797, 3.014363 used;
- Searching Google for "golang measure execution time" only seems to dish up links showing the total wall-clock time including all other go-routines, but not the wall-clock time excluding all other go-routines: [1] [2] [3] [4] [5] [6] [7] [8]
- Not sure what to Google for... or if a solution exists.
[1] https://stackoverflow.com/questions/45766572/is-there-an-efficient-way-to-calculate-execution-time-in-golang/45766707 [2] https://gist.github.com/denisbrodbeck/6f5813681dfe049b1e5392f978738833 [3] https://flaviocopes.com/golang-measure-time/ [4] https://www.admfactory.com/how-to-measure-execution-time-in-golang/ [5] https://programming.guide/go/measure-execution-time.html [6] https://golangcode.com/function-execution-time/ [7] https://www.socketloop.com/tutorials/golang-measure-execution-time-for-a-function [8] https://medium.com/swlh/easy-guide-to-latency-measurement-in-golang-38c3297ebbd2 [9] https://forum.golangbridge.org/t/measure-execution-time-of-a-fast-operation/16313
- Re-run experiment 1 but enable the profiler resulting in
pprof.out
:
$ go build blocking.go && perl -e 'sleep 15; printf qq[word!\n];' | USE_PPROF=1 GOMAXPROCS_WANTED=3 ./blocking ; wc --bytes pprof.out
0.000001 23998=tid main() // enter; .NumCPU()=16 .GOMAXPROCS(0)=16 .NumGoroutine()=1
0.000052 23998=tid main() // enabling pprof
0.000559 23998=tid main() // .GOMAXPROCS(0)=16
0.000607 23998=tid main() // .GOMAXPROCS(0)=3
0.000613 23998=tid main() // sleeping
3.000816 23998=tid main() // waking
3.000859 23998=tid f(direct, 120000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=2
3.713033 23998=tid f(direct, 120000) // i=20000
4.425492 23998=tid f(direct, 120000) // i=40000
5.136554 23998=tid f(direct, 120000) // i=60000
5.847794 23998=tid f(direct, 120000) // i=80000
6.559673 23998=tid f(direct, 120000) // i=100000
7.270959 23998=tid f(direct, 120000) // i=120000
7.271007 23998=tid f(direct, 120000) // leave in 4.270148; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=2
7.271036 23998=tid main() // type in word + hit return
7.271079 24000=tid f(bar, 80000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=6
7.271113 24001=tid f(foo, 100000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=6
7.277925 24003=tid func(going without loop)
7.277969 24003=tid f(baz, 60000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=5
7.633914 24000=tid f(baz, 60000) // i=20000
7.745805 24000=tid f(bar, 80000) // i=20000
7.864593 24000=tid f(foo, 100000) // i=20000
7.990347 24003=tid f(baz, 60000) // i=40000
8.220914 24003=tid f(bar, 80000) // i=40000
8.346848 24003=tid f(baz, 60000) // i=60000
8.346876 24003=tid f(baz, 60000) // leave in 1.068908; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=5
8.458698 24000=tid f(foo, 100000) // i=40000
8.696054 24000=tid f(bar, 80000) // i=60000
9.052872 24000=tid f(foo, 100000) // i=60000
9.171187 24001=tid f(bar, 80000) // i=80000
9.171234 24001=tid f(bar, 80000) // leave in 1.900155; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=4
9.645728 24000=tid f(foo, 100000) // i=80000
10.238577 24000=tid f(foo, 100000) // i=100000
10.238622 24000=tid f(foo, 100000) // leave in 2.967510; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=3
15.000055 23998=tid main() // input=word!
15.000091 23998=tid main() // leave in 15.000090; .NumCPU()=16 .GOMAXPROCS(0)=3 .NumGoroutine()=2
450 pprof.out
$ go tool pprof -top pprof.out
File: blocking
Type: cpu
Duration: 15s, Total samples = 9.69s (64.60%)
Showing nodes accounting for 9.67s, 99.79% of 9.69s total
Dropped 1 node (cum <= 0.05s)
flat flat% sum% cum cum%
9.67s 99.79% 99.79% 9.69s 100% main.f
0 0% 99.79% 4.27s 44.07% main.main
0 0% 99.79% 4.27s 44.07% runtime.main
- Observation: The flat `9.67s` reflects the total accumulated time for function `f()`.
-
Re-run experiment 2 but enable the profiler resulting in
pprof.out
:
$ go build blocking.go && perl -e 'sleep 15; printf qq[word!\n];' | USE_PPROF=1 GOMAXPROCS_WANTED=1 ./blocking ; wc --bytes pprof.out
0.000018 23806=tid main() // enter; .NumCPU()=16 .GOMAXPROCS(0)=16 .NumGoroutine()=1
0.000220 23806=tid main() // enabling pprof
0.000342 23806=tid main() // .GOMAXPROCS(0)=16
0.000429 23806=tid main() // .GOMAXPROCS(0)=1
0.000437 23806=tid main() // sleeping
3.000628 23808=tid main() // waking
3.000666 23808=tid f(direct, 120000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=2
3.713037 23808=tid f(direct, 120000) // i=20000
4.424750 23808=tid f(direct, 120000) // i=40000
5.137150 23808=tid f(direct, 120000) // i=60000
5.849831 23808=tid f(direct, 120000) // i=80000
6.561112 23808=tid f(direct, 120000) // i=100000
7.273245 23808=tid f(direct, 120000) // i=120000
7.273292 23808=tid f(direct, 120000) // leave in 4.272627; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=2
7.273310 23808=tid main() // type in word + hit return
7.286434 23809=tid func(going without loop)
7.286485 23809=tid f(foo, 100000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=5
7.301474 23809=tid f(bar, 80000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=5
7.321713 23809=tid f(baz, 60000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=5
8.386264 23809=tid f(baz, 60000) // i=20000
8.706789 23809=tid f(bar, 80000) // i=20000
9.032683 23809=tid f(foo, 100000) // i=20000
9.490639 23809=tid f(baz, 60000) // i=40000
10.091414 23809=tid f(bar, 80000) // i=40000
10.554625 23809=tid f(baz, 60000) // i=60000
10.554671 23809=tid f(baz, 60000) // leave in 3.232958; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=5
10.708127 23809=tid f(foo, 100000) // i=40000
11.209971 23809=tid f(bar, 80000) // i=60000
11.908678 23809=tid f(foo, 100000) // i=60000
12.170510 23809=tid f(bar, 80000) // i=80000
12.170557 23809=tid f(bar, 80000) // leave in 4.869083; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=4
12.627328 23809=tid f(foo, 100000) // i=80000
13.220404 23809=tid f(foo, 100000) // i=100000
13.220445 23809=tid f(foo, 100000) // leave in 5.933961; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=3
14.998897 23808=tid main() // input=word!
14.998929 23808=tid main() // leave in 14.998911; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=2
590 pprof.out
$ go tool pprof -top pprof.out
File: blocking
Type: cpu
Duration: 15s, Total samples = 10.22s (68.14%)
Showing nodes accounting for 10.19s, 99.71% of 10.22s total
Dropped 5 nodes (cum <= 0.05s)
flat flat% sum% cum cum%
10.19s 99.71% 99.71% 10.20s 99.80% main.f
0 0% 99.71% 4.27s 41.78% main.main
0 0% 99.71% 4.27s 41.78% runtime.main
- Observation: The flat `10.19s` reflects the total accumulated time for function `f()`.
-
Observation: The 'flat' seconds reported by
pprof
excludes delays due to pre-emption, GC, etc. -
Observation: The 1st & 2nd `pprof` 'flat' seconds are very similar -- `9.67s` vs `10.19s` -- and it's not possible to tell that the 2nd `pprof` run took 3 seconds longer due to go-routine / LWP contention.
- Re-run experiment 2 with the execution trace, which results in `trace.out`:
$ go build blocking.go && perl -e 'sleep 15; printf qq[word!\n];' | USE_TRACE=1 GOMAXPROCS_WANTED=1 ./blocking ; wc --bytes trace.out
0.000020 24170=tid main() // enter; .NumCPU()=16 .GOMAXPROCS(0)=16 .NumGoroutine()=1
0.000469 24170=tid main() // enabling trace
0.000816 24170=tid main() // .GOMAXPROCS(0)=16
0.000963 24170=tid main() // .GOMAXPROCS(0)=1
0.000976 24170=tid main() // sleeping
3.001165 24173=tid main() // waking
3.001239 24173=tid f(direct, 120000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=2
3.713804 24173=tid f(direct, 120000) // i=20000
4.427349 24173=tid f(direct, 120000) // i=40000
5.139586 24173=tid f(direct, 120000) // i=60000
5.852789 24173=tid f(direct, 120000) // i=80000
6.564084 24173=tid f(direct, 120000) // i=100000
7.277072 24173=tid f(direct, 120000) // i=120000
7.277126 24173=tid f(direct, 120000) // leave in 4.275888; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=2
7.277199 24173=tid main() // type in word + hit return
7.282649 24170=tid func(going without loop)
7.282725 24174=tid f(foo, 100000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=6
7.282777 24174=tid f(bar, 80000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=6
7.282788 24174=tid f(baz, 60000) // enter; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=6
8.411256 24174=tid f(baz, 60000) // i=20000
8.711731 24174=tid f(bar, 80000) // i=20000
8.987458 24174=tid f(foo, 100000) // i=20000
9.474273 24174=tid f(baz, 60000) // i=40000
10.176263 24174=tid f(bar, 80000) // i=40000
10.588592 24174=tid f(baz, 60000) // i=60000
10.588650 24174=tid f(baz, 60000) // leave in 3.305863; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=5
10.667806 24174=tid f(foo, 100000) // i=40000
11.258817 24174=tid f(bar, 80000) // i=60000
11.847173 24174=tid f(foo, 100000) // i=60000
12.198086 24174=tid f(bar, 80000) // i=80000
12.198140 24174=tid f(bar, 80000) // leave in 4.915364; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=4
12.621021 24174=tid f(foo, 100000) // i=80000
13.214052 24174=tid f(foo, 100000) // i=100000
13.214104 24174=tid f(foo, 100000) // leave in 5.931380; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=3
15.001921 24173=tid main() // input=word!
15.001955 24173=tid main() // leave in 15.001936; .NumCPU()=16 .GOMAXPROCS(0)=1 .NumGoroutine()=2
8988 trace.out
$ go tool trace trace.out
2021/05/28 15:46:00 Parsing trace...
2021/05/28 15:46:00 Splitting trace...
2021/05/28 15:46:00 Opening browser. Trace viewer is listening on http://127.0.0.1:60944
^C
- Browser webpage opens. Click on `Goroutine analysis`, then `main.f N=3`
. - Text rendering of what the HTML page shows:
Goroutine Total Execution Network Sync Blocking Scheduler GC GC
wait block syscall wait sweeping pause
[] 6017ms 3002ms 0ns 0ns 0ns 3014ms 0ns (0.0%) 0ns (0.0%)
[] 4937ms 1921ms 0ns 0ns 0ns 3016ms 0ns (0.0%) 0ns (0.0%)
[] 3265ms 1087ms 0ns 0ns 0ns 2178ms 0ns (0.0%) 0ns (0.0%)
- Observation: The good news is that it shows all the timing info we're after :-)
- Observation: The bad news is that `trace.out` is about 20 times bigger than `pprof.out`, and this huge size could be prohibitive for larger Golang programs running for longer?
- Observation: The other bad news is that it's all graphical to access this info, e.g. [1].
- Question: How to access the timing info without having to go HTML? Or without having to save the trace file which can get very / too large [2]?
[1] https://about.sourcegraph.com/go/an-introduction-to-go-tool-trace-rhys-hiltner/ [2] golang/go#21870