Last active
December 12, 2015 03:58
-
-
Save dchest/4710403 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Core 2 Duo | |
| benchmark old ns/op new ns/op delta | |
| BenchmarkRC4_128 817 614 -24.85% | |
| BenchmarkRC4_1K 6406 4967 -22.46% | |
| BenchmarkRC4_8K 50450 38976 -22.74% | |
| benchmark old MB/s new MB/s speedup | |
| BenchmarkRC4_128 156.59 208.35 1.33x | |
| BenchmarkRC4_1K 159.83 206.16 1.29x | |
| BenchmarkRC4_8K 160.47 207.71 1.29x | |
| UPDATE: it turns out that on Core i7 the new version is slower. | |
| Also, on Core 2 Duo the 8-byte block processing doesn't help: removing it and rearranging the code | |
| makes it even faster. I'm done :) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // Copyright 2013 The Go Authors. All rights reserved. | |
| // Use of this source code is governed by a BSD-style | |
| // license that can be found in the LICENSE file. | |
| // +build amd64 | |
| package rc4 | |
| import ( | |
| "unsafe" | |
| ) | |
// xorKeyStream is declared without a body; the note inside XORKeyStream
// describes it as the assembly routine the original version called.
// That call passed pointers to the first bytes of dst and src, len(src) as n,
// and pointers to the cipher's state array and its i and j indices.
// NOTE(review): the assembly source is not visible here — confirm its contract
// (for example whether n == 0 is allowed) against the .s file.
| func xorKeyStream(dst, src *byte, n int, state *[256]byte, i, j *uint8) | |
| // XORKeyStream sets dst to the result of XORing src with the key stream. | |
| // Dst and src may be the same slice but otherwise should not overlap. | |
| func (c *Cipher) XORKeyStream(dst, src []byte) { | |
| // Original version, calling to assembly. | |
| /* | |
| if len(src) == 0 { | |
| return | |
| } | |
| xorKeyStream(&dst[0], &src[0], len(src), &c.s, &c.i, &c.j) | |
| */ | |
| // Go unsafe version. | |
| if len(dst) < len(src) { | |
| panic("rc4: dst is shorter than src") | |
| } | |
| i, j := c.i, c.j | |
| s := &c.s | |
| sp := uintptr(unsafe.Pointer(&src[0])) | |
| dp := uintptr(unsafe.Pointer(&dst[0])) | |
| sblockend := sp + uintptr(len(src) &^ 7) | |
| send := sp + uintptr(len(src)) | |
| // Process 8-byte blocks. | |
| for sp < sblockend { | |
| // Generate 8 bytes of RC4 stream into chunk. | |
| // 1. | |
| i += 1 | |
| a := s[i] | |
| j += a | |
| b := s[j] | |
| s[i], s[j] = b, a | |
| chunk := uint64(s[a+b]) | |
| // 2. | |
| i += 1 | |
| a = s[i] | |
| j += a | |
| b = s[j] | |
| s[i], s[j] = b, a | |
| chunk |= uint64(s[a+b]) << 8 | |
| // 3. | |
| i += 1 | |
| a = s[i] | |
| j += a | |
| b = s[j] | |
| s[i], s[j] = b, a | |
| chunk |= uint64(s[a+b]) << 16 | |
| // 4. | |
| i += 1 | |
| a = s[i] | |
| j += a | |
| b = s[j] | |
| s[i], s[j] = b, a | |
| chunk |= uint64(s[a+b]) << 24 | |
| // 5. | |
| i += 1 | |
| a = s[i] | |
| j += a | |
| b = s[j] | |
| s[i], s[j] = b, a | |
| chunk |= uint64(s[a+b]) << 32 | |
| // 6. | |
| i += 1 | |
| a = s[i] | |
| j += a | |
| b = s[j] | |
| s[i], s[j] = b, a | |
| chunk |= uint64(s[a+b]) << 40 | |
| // 7. | |
| i += 1 | |
| a = s[i] | |
| j += a | |
| b = s[j] | |
| s[i], s[j] = b, a | |
| chunk |= uint64(s[a+b]) << 48 | |
| // 8. | |
| i += 1 | |
| a = s[i] | |
| j += a | |
| b = s[j] | |
| s[i], s[j] = b, a | |
| chunk |= uint64(s[a+b]) << 56 | |
| // XOR chunk with 8 bytes from src and store in dst. | |
| *(*uint64)(unsafe.Pointer(dp)) = chunk ^ *(*uint64)(unsafe.Pointer(sp)) | |
| sp += 8 | |
| dp += 8 | |
| } | |
| for sp < send { | |
| i += 1 | |
| a := s[i] | |
| j += a | |
| b := s[j] | |
| s[i], s[j] = b, a | |
| *(*byte)(unsafe.Pointer(dp)) = s[a+b] ^ *(*byte)(unsafe.Pointer(sp)) | |
| sp++ | |
| dp++ | |
| } | |
| c.i, c.j = i, j | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment