-
-
Save xogeny/b819af6a0cf8ba1caaef to your computer and use it in GitHub Desktop.
package copy_vs_append | |
import ( | |
"testing" | |
) | |
func TestCopy(t *testing.T) { | |
y := doCopy(true, false) | |
if len(y) != 1000 { | |
t.Fatalf("Expected len(y) to be 1000 but was %d", len(y)) | |
} | |
} | |
func TestAppend(t *testing.T) { | |
y := doCopy(false, false) | |
if len(y) != 1000 { | |
t.Fatalf("Expected len(y) to be 1000 but was %d", len(y)) | |
} | |
} | |
func TestAppendAlloc(t *testing.T) { | |
y := doCopy(false, true) | |
if len(y) != 1000 { | |
t.Fatalf("Expected len(y) to be 1000 but was %d", len(y)) | |
} | |
} | |
// doCopy builds a fresh []int64 duplicating a 1000-element zeroed
// source slice, using one of three strategies:
//
//   - useCopy == true: allocate the destination at full length and use
//     the built-in copy.
//   - useCopy == false, preAlloc == true: append into a destination
//     pre-allocated with capacity 1000 (append never has to grow).
//   - useCopy == false, preAlloc == false: append into a nil slice,
//     forcing append to allocate the backing array itself.
//
// All three return a slice of length 1000 with the same contents.
func doCopy(useCopy bool, preAlloc bool) []int64 {
	// len == cap == 1000; the redundant explicit capacity argument of
	// make([]T, n, n) has been dropped (staticcheck S1019).
	existing := make([]int64, 1000)

	if useCopy {
		y := make([]int64, 1000)
		copy(y, existing)
		return y
	}

	// A nil slice is the idiomatic "empty" starting point; append
	// treats it the same as []int64{} but without the literal.
	var dst []int64
	if preAlloc {
		dst = make([]int64, 0, 1000)
	}
	return append(dst, existing...)
}
func BenchmarkAppend(b *testing.B) { | |
for i := 0; i < b.N; i++ { | |
doCopy(false, false) | |
} | |
} | |
func BenchmarkAppendAlloc(b *testing.B) { | |
for i := 0; i < b.N; i++ { | |
doCopy(false, true) | |
} | |
} | |
// appendInlineSink receives each iteration's result so the compiler
// cannot dead-code-eliminate the append being measured (the original
// discarded the result with `_ =`, leaving the work optimizable away).
var appendInlineSink []int64

// BenchmarkAppendAllocInline times a pre-allocated append written
// inline, avoiding the function-call overhead of doCopy. Note the
// allocation of `existing` is inside the timed loop, so it is counted.
func BenchmarkAppendAllocInline(b *testing.B) {
	for i := 0; i < b.N; i++ {
		// Redundant make([]T, n, n) capacity dropped (staticcheck S1019);
		// var-then-assign collapsed to := (staticcheck S1021).
		existing := make([]int64, 1000)
		dst := make([]int64, 0, 1000)
		appendInlineSink = append(dst, existing...)
	}
}
func BenchmarkCopy(b *testing.B) { | |
for i := 0; i < b.N; i++ { | |
doCopy(true, true) | |
} | |
} |
Anyone arriving here should test the benchmark on their own system. On the latest Go, pre-allocating and copying are faster:
BenchmarkAppend-4 1000000 1525 ns/op 8192 B/op 1 allocs/op
BenchmarkAppendAlloc-4 2000000 669 ns/op 0 B/op 0 allocs/op
BenchmarkAppendAllocInline-4 2000000 678 ns/op 0 B/op 0 allocs/op
BenchmarkCopy-4 2000000 675 ns/op 0 B/op 0 allocs/op
On my Ubuntu 18.04 machine (Go 1.13, 8 CPUs), copy is faster:
goos: linux
goarch: amd64
pkg: github.com/alexyslozada/pruebas
BenchmarkAppend-8 846910 1284 ns/op
BenchmarkAppendAlloc-8 3836578 319 ns/op
BenchmarkAppendAllocInline-8 3863030 306 ns/op
BenchmarkCopy-8 3975591 300 ns/op
PASS
Debian 10, 4.19.0-8-amd64, Intel(R) Core(TM) i7-3930K CPU @ 3.20GHz
❯ go version
go version go1.12.14 linux/amd64
❯ go test -bench=. -benchmem
goos: linux
goarch: amd64
BenchmarkAppend-12 500000 2754 ns/op 8192 B/op 1 allocs/op
BenchmarkAppendAlloc-12 2000000 671 ns/op 0 B/op 0 allocs/op
BenchmarkAppendAllocInline-12 2000000 687 ns/op 0 B/op 0 allocs/op
BenchmarkCopy-12 2000000 697 ns/op 0 B/op 0 allocs/op
PASS
ok _/tmp/copy_vs_append 7.632s
❯ go version
go version go1.14 linux/amd64
❯ go test -bench=. -benchmem
goos: linux
goarch: amd64
BenchmarkAppend-12 363030 3560 ns/op 8192 B/op 1 allocs/op
BenchmarkAppendAlloc-12 1696290 680 ns/op 0 B/op 0 allocs/op
BenchmarkAppendAllocInline-12 1697086 706 ns/op 0 B/op 0 allocs/op
BenchmarkCopy-12 1710579 696 ns/op 0 B/op 0 allocs/op
PASS
ok _/tmp/copy_vs_append 6.917s
Ubuntu 20.04.1 LTS, 5.4.0-42-generic, Intel® Core™ i7-8665U CPU @ 1.90GHz × 8
> go version
go version go1.14.4 linux/amd64
> go test -bench=. -benchmem
goos: linux
goarch: amd64
BenchmarkAppend-8 1000000 1026 ns/op 8192 B/op 1 allocs/op
BenchmarkAppendAlloc-8 4462958 275 ns/op 0 B/op 0 allocs/op
BenchmarkAppendAllocInline-8 4360440 275 ns/op 0 B/op 0 allocs/op
BenchmarkCopy-8 4355202 275 ns/op 0 B/op 0 allocs/op
PASS
ok _/tmp/copy_vs_append 5.493s
I made some modifications to the benchmark to measure inline copy vs. inline append for the pre-allocated case only (not counting the allocation cost of the 'existing' data to copy).
https://gist.github.com/smyrman/f7fd1734f9ea20d4648ed359bbcc6ac7
I believe this may be more relevant to compare. The benchmark code here includes a number of branches (if statements), which in themselves may be expensive (if predicted wrong), or at least expensive enough to affect the results.
Running `go test -bench=.` gives the benchmark output above. The PASS simply indicates that the doCopy function actually performs a copy (worth making sure). So the really strange part is that using copy appears to take longer than anything else, and the "inline" version looks extremely fast. But some strange stuff is going on here. The "inline" version clearly avoids the cost of a function call. But why is copy so slow? Well, it isn't... necessarily. If you rerun the benchmark you can also get results where copy is the fastest. One thing I've noticed in repeated tests (OS X 10.9) is that the numbers for the "append" varieties have the least variability, while the "copy" one changes quite a bit from run to run. So even though one can argue that copy is faster in some cases, you can also make the case that on average they are all about the same, and the append versions are probably more predictable. But this isn't the end of the story... I did more benchmarks and they show some interesting differences.