Skip to content

Instantly share code, notes, and snippets.

@dchest
Last active December 12, 2015 03:58
Show Gist options
  • Select an option

  • Save dchest/4710403 to your computer and use it in GitHub Desktop.

Select an option

Save dchest/4710403 to your computer and use it in GitHub Desktop.
Core 2 Duo
benchmark old ns/op new ns/op delta
BenchmarkRC4_128 817 614 -24.85%
BenchmarkRC4_1K 6406 4967 -22.46%
BenchmarkRC4_8K 50450 38976 -22.74%
benchmark old MB/s new MB/s speedup
BenchmarkRC4_128 156.59 208.35 1.33x
BenchmarkRC4_1K 159.83 206.16 1.29x
BenchmarkRC4_8K 160.47 207.71 1.29x
UPDATE: it turns out that on a Core i7 this version is slower.
Also, on the Core 2 Duo the 8-byte-block trick doesn't help: removing it and rearranging the code
makes the code even faster. I'm done :)
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build amd64
package rc4
import (
"unsafe"
)
// xorKeyStream is declared without a body, so its implementation must be
// supplied outside this file (presumably amd64 assembly, given the build tag
// at the top of the file; confirm against the accompanying .s file).
//
// NOTE(review): judging from the call quoted in XORKeyStream's comments, it
// receives pointers to the first bytes of dst and src, the number of bytes n,
// the cipher's 256-byte state table, and pointers to the cipher's i and j
// indices. Verify the exact contract against the implementation.
func xorKeyStream(dst, src *byte, n int, state *[256]byte, i, j *uint8)
// XORKeyStream sets dst to the result of XORing src with the key stream.
// Dst and src may be the same slice but otherwise should not overlap.
//
// An empty src is a no-op. XORKeyStream panics if dst is shorter than src;
// only the first len(src) bytes of dst are written. The cipher state (c.s,
// c.i, c.j) advances by one step per input byte, so consecutive calls
// continue the same key stream.
//
// The 8-byte fast path packs key-stream bytes least-significant first and
// XORs them through a native uint64 load/store at an arbitrary address, so it
// is only correct on little-endian targets that allow unaligned 64-bit
// access.
func (c *Cipher) XORKeyStream(dst, src []byte) {
	// The original version called the assembly routine:
	//
	//	if len(src) == 0 {
	//		return
	//	}
	//	xorKeyStream(&dst[0], &src[0], len(src), &c.s, &c.i, &c.j)
	//
	// What follows is the Go version using unsafe.
	if len(src) == 0 {
		// Nothing to encrypt; this also keeps &src[0] below from panicking.
		return
	}
	if len(dst) < len(src) {
		panic("rc4: dst is shorter than src")
	}

	i, j := c.i, c.j
	s := &c.s

	// Keep the base addresses as unsafe.Pointer and do the offset arithmetic
	// inside a single conversion expression. Holding an address in a uintptr
	// variable across statements is not a valid use of unsafe.Pointer.
	sp := unsafe.Pointer(&src[0])
	dp := unsafe.Pointer(&dst[0])

	n := len(src)
	blockEnd := n &^ 7 // largest multiple of 8 not exceeding n
	k := 0

	// Process 8-byte blocks: generate 8 bytes of RC4 stream into chunk, then
	// XOR them with src in one 64-bit operation. The steps are unrolled by
	// hand; i, j, a and b are uint8, so all index arithmetic wraps mod 256.
	for k < blockEnd {
		// 1.
		i++
		a := s[i]
		j += a
		b := s[j]
		s[i], s[j] = b, a
		chunk := uint64(s[a+b])
		// 2.
		i++
		a = s[i]
		j += a
		b = s[j]
		s[i], s[j] = b, a
		chunk |= uint64(s[a+b]) << 8
		// 3.
		i++
		a = s[i]
		j += a
		b = s[j]
		s[i], s[j] = b, a
		chunk |= uint64(s[a+b]) << 16
		// 4.
		i++
		a = s[i]
		j += a
		b = s[j]
		s[i], s[j] = b, a
		chunk |= uint64(s[a+b]) << 24
		// 5.
		i++
		a = s[i]
		j += a
		b = s[j]
		s[i], s[j] = b, a
		chunk |= uint64(s[a+b]) << 32
		// 6.
		i++
		a = s[i]
		j += a
		b = s[j]
		s[i], s[j] = b, a
		chunk |= uint64(s[a+b]) << 40
		// 7.
		i++
		a = s[i]
		j += a
		b = s[j]
		s[i], s[j] = b, a
		chunk |= uint64(s[a+b]) << 48
		// 8.
		i++
		a = s[i]
		j += a
		b = s[j]
		s[i], s[j] = b, a
		chunk |= uint64(s[a+b]) << 56

		// XOR chunk with 8 bytes from src and store in dst.
		in := *(*uint64)(unsafe.Pointer(uintptr(sp) + uintptr(k)))
		*(*uint64)(unsafe.Pointer(uintptr(dp) + uintptr(k))) = chunk ^ in
		k += 8
	}

	// Remaining 0-7 bytes, one at a time with ordinary slice indexing.
	for ; k < n; k++ {
		i++
		a := s[i]
		j += a
		b := s[j]
		s[i], s[j] = b, a
		dst[k] = s[a+b] ^ src[k]
	}

	c.i, c.j = i, j
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment