Created
December 15, 2012 17:21
-
-
Save donovanhide/4297349 to your computer and use it in GitHub Desktop.
First attempt at fast summing of a slice of uint8 using SSE assembly
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
go test sum -v -benchtime=0.1s | |
=== RUN TestSmallSumBytes | |
--- PASS: TestSmallSumBytes (0.00 seconds) | |
=== RUN TestSumBytes | |
--- PASS: TestSumBytes (2.52 seconds) | |
=== RUN TestBenchmark | |
--- PASS: TestBenchmark (10.96 seconds) | |
sum_test.go:75: Benchmark Results | |
Length: 0 Fast: 5.42 ns/op Slow: 4.93 ns/op Improvement -9.03% | |
Length: 1 Fast: 7.56 ns/op Slow: 5.26 ns/op Improvement -30.39% | |
Length: 2 Fast: 7.81 ns/op Slow: 5.88 ns/op Improvement -24.71% | |
Length: 3 Fast: 7.62 ns/op Slow: 7.04 ns/op Improvement -7.57% | |
Length: 4 Fast: 7.69 ns/op Slow: 8.07 ns/op Improvement 4.88% | |
Length: 5 Fast: 8.02 ns/op Slow: 8.97 ns/op Improvement 11.92% | |
Length: 6 Fast: 7.68 ns/op Slow: 10.09 ns/op Improvement 31.37% | |
Length: 7 Fast: 9.74 ns/op Slow: 11.31 ns/op Improvement 16.12% | |
Length: 8 Fast: 7.84 ns/op Slow: 12.65 ns/op Improvement 61.35% | |
Length: 15 Fast: 7.70 ns/op Slow: 21.91 ns/op Improvement 184.54% | |
Length: 16 Fast: 8.02 ns/op Slow: 23.20 ns/op Improvement 189.05% | |
Length: 31 Fast: 9.33 ns/op Slow: 42.76 ns/op Improvement 358.28% | |
Length: 32 Fast: 7.91 ns/op Slow: 43.11 ns/op Improvement 444.86% | |
Length: 63 Fast: 12.29 ns/op Slow: 81.86 ns/op Improvement 566.01% | |
Length: 64 Fast: 10.50 ns/op Slow: 86.07 ns/op Improvement 719.70% | |
Length: 100 Fast: 15.70 ns/op Slow: 106.16 ns/op Improvement 576.10% | |
Length: 135 Fast: 18.97 ns/op Slow: 146.31 ns/op Improvement 671.24% | |
Length: 256 Fast: 38.62 ns/op Slow: 263.35 ns/op Improvement 581.92% | |
Length: 1024 Fast: 112.32 ns/op Slow: 1000.87 ns/op Improvement 791.06% | |
Length: 65536 Fast: 6490.84 ns/op Slow: 62188.00 ns/op Improvement 858.09% | |
PASS | |
ok sum 13.611s |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
DATA lut+0x00(SB)/8, $0x00000000000000FF | |
DATA lut+0x08(SB)/8, $0x0000000000000000 | |
DATA lut+0x10(SB)/8, $0x000000000000FFFF | |
DATA lut+0x18(SB)/8, $0x0000000000000000 | |
DATA lut+0x20(SB)/8, $0x0000000000FFFFFF | |
DATA lut+0x28(SB)/8, $0x0000000000000000 | |
DATA lut+0x30(SB)/8, $0x00000000FFFFFFFF | |
DATA lut+0x38(SB)/8, $0x0000000000000000 | |
DATA lut+0x40(SB)/8, $0x000000FFFFFFFFFF | |
DATA lut+0x48(SB)/8, $0x0000000000000000 | |
DATA lut+0x50(SB)/8, $0x0000FFFFFFFFFFFF | |
DATA lut+0x58(SB)/8, $0x0000000000000000 | |
DATA lut+0x60(SB)/8, $0x00FFFFFFFFFFFFFF | |
DATA lut+0x68(SB)/8, $0x0000000000000000 | |
DATA lut+0x70(SB)/8, $0xFFFFFFFFFFFFFFFF | |
DATA lut+0x78(SB)/8, $0x0000000000000000 | |
DATA lut+0x80(SB)/8, $0xFFFFFFFFFFFFFFFF | |
DATA lut+0x88(SB)/8, $0x00000000000000FF | |
DATA lut+0x90(SB)/8, $0xFFFFFFFFFFFFFFFF | |
DATA lut+0x98(SB)/8, $0x000000000000FFFF | |
DATA lut+0xA0(SB)/8, $0xFFFFFFFFFFFFFFFF | |
DATA lut+0xA8(SB)/8, $0x0000000000FFFFFF | |
DATA lut+0xB0(SB)/8, $0xFFFFFFFFFFFFFFFF | |
DATA lut+0xB8(SB)/8, $0x00000000FFFFFFFF | |
DATA lut+0xC0(SB)/8, $0xFFFFFFFFFFFFFFFF | |
DATA lut+0xC8(SB)/8, $0x000000FFFFFFFFFF | |
DATA lut+0xD0(SB)/8, $0xFFFFFFFFFFFFFFFF | |
DATA lut+0xD8(SB)/8, $0x0000FFFFFFFFFFFF | |
DATA lut+0xE0(SB)/8, $0xFFFFFFFFFFFFFFFF | |
DATA lut+0xE8(SB)/8, $0x00FFFFFFFFFFFFFF | |
DATA lut+0xF0(SB)/8, $0xFFFFFFFFFFFFFFFF | |
DATA lut+0xF8(SB)/8, $0xFFFFFFFFFFFFFFFF | |
GLOBL lut(SB), $0x100 | |
// func FastSumUint8(x []uint8) uint64 | |
TEXT ·FastSumUint8(SB),7,$0 | |
MOVQ x+0(FP), SI // data pointer | |
MOVQ x+8(FP), CX // len(x) | |
XORQ AX, AX // result | |
PXOR X0,X0 // parallel total | |
CMPQ CX,$0 | |
// Zero Length | |
JE done | |
// Less than 16 bytes | |
CMPQ CX,$16 | |
JLE small | |
large: | |
PXOR X1, X1 | |
PSADBW (SI),X1 | |
PADDQ X1,X0 | |
ADDQ $16, SI | |
SUBQ $16, CX | |
CMPQ CX, $16 | |
JGE large | |
CMPQ CX,$0 | |
JE total | |
small: | |
MOVOU (SI),X1 | |
PXOR X3, X3 | |
SUBQ $1,CX | |
ADDQ CX,CX | |
MOVOU lut(SB)(CX*8),X2 | |
PAND X2,X1 | |
PSADBW X1,X3 | |
PADDQ X3,X0 | |
total: | |
MOVQ X0,BX | |
MOVHLPS X0,X1 | |
MOVQ X1,AX | |
ADDQ BX,AX | |
done: | |
MOVQ AX,ret+24(FP) | |
RET | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package sum | |
import ( | |
"fmt" | |
"math/rand" | |
"reflect" | |
"testing" | |
"testing/quick" | |
) | |
func FastSumUint8(x []uint8) uint64 | |
func SumUint8(x []uint8) uint64 { | |
sum := uint64(0) | |
for _, v := range x { | |
sum += uint64(v) | |
} | |
return sum | |
} | |
type TestSlice []uint8 | |
func buildTestSlice(length int, capacity int) TestSlice { | |
test := make(TestSlice, length, capacity) | |
for i := range test { | |
test[i] = uint8(rand.Uint32()) | |
} | |
return test | |
} | |
func (s TestSlice) Generate(rand *rand.Rand, size int) reflect.Value { | |
length := rand.Int() % (1 << 16) | |
return reflect.ValueOf(buildTestSlice(length, length+rand.Int()%1024)) | |
} | |
func TestSmallSumBytes(t *testing.T) { | |
slice := buildTestSlice(32, 32) | |
for i := range slice { | |
slow := SumUint8(slice[:i]) | |
fast := FastSumUint8(slice[:i]) | |
if fast != slow { | |
t.Errorf("%v %v Fast: %v Slow: %v", i, slice[:i], fast, slow) | |
} | |
} | |
} | |
func TestSumBytes(t *testing.T) { | |
config := &quick.Config{MaxCount: 1 << 10} | |
fast := func(x TestSlice) uint64 { return FastSumUint8(x) } | |
slow := func(x TestSlice) uint64 { return SumUint8(x) } | |
if err := quick.CheckEqual(fast, slow, config); err != nil { | |
t.Error(err) | |
} | |
} | |
func runBenchmark(f func(x []uint8) uint64, numbers []uint8, b *testing.B) { | |
b.ResetTimer() | |
for i := 0; i < b.N; i++ { | |
f(numbers) | |
} | |
} | |
func TestBenchmark(t *testing.T) { | |
benchmarks := []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 15, 16, 31, 32, 63, 64, 100, 135, 256, 1 << 10, 1 << 16} | |
results := "Benchmark Results\n" | |
for _, n := range benchmarks { | |
slice := buildTestSlice(n, n) | |
fast := testing.Benchmark(func(b *testing.B) { runBenchmark(FastSumUint8, slice, b) }) | |
slow := testing.Benchmark(func(b *testing.B) { runBenchmark(SumUint8, slice, b) }) | |
fastNsPerOp := float64(fast.T.Nanoseconds()) / float64(fast.N) | |
slowNsPerOp := float64(slow.T.Nanoseconds()) / float64(slow.N) | |
results += fmt.Sprintf("Length: %8d Fast:%12.2f ns/op Slow:%12.2f ns/op Improvement %6.2f%%\n", n, fastNsPerOp, slowNsPerOp, ((1/(fastNsPerOp/slowNsPerOp))-1)*100) | |
} | |
t.Log(results) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment