Last active
November 21, 2018 05:30
-
-
Save klauspost/64b36e9904d76d6fc122 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Copyright 2011 The Go Authors. All rights reserved. | |
// Copyright 2013 Klaus Post | |
// Use of this source code is governed by a BSD-style | |
// license that can be found in the LICENSE file. | |
package crc32 | |
import ( | |
"github.com/klauspost/intrinsics/x86/sse2" | |
"github.com/klauspost/intrinsics/x86" | |
"github.com/klauspost/intrinsics/x86/pclmulqdq" | |
"github.com/klauspost/intrinsics/x86/sse4" | |
) | |
// This file contains the code to call the SSE 4.2 version of the Castagnoli
// CRC and the CLMUL (carry-less multiplication) version of the IEEE CRC.
// The three feature probes below are implemented in crc_amd64.s and use
// CPUID to query the processor.

// haveSSE41 reports whether SSE 4.1 is supported.
func haveSSE41() bool

// haveSSE42 reports whether SSE 4.2 is supported.
func haveSSE42() bool

// haveCLMUL reports whether CLMUL is supported.
func haveCLMUL() bool
// castagnoliSSE42 computes the Castagnoli CRC update with the SSE4.2 CRC32
// instruction. The implementation lives in crc_amd64.s.
func castagnoliSSE42(crc uint32, p []byte) uint32
var sse42 = haveSSE42() | |
var useFastIEEE = haveCLMUL() && haveSSE41() | |
func updateCastagnoli(crc uint32, p []byte) uint32 { | |
if sse42 { | |
return castagnoliSSE42(crc, p) | |
} | |
return update(crc, castagnoliTable, p) | |
} | |
func updateIEEE(crc uint32, p []byte) uint32 { | |
if useFastIEEE && len(p) >= 64 { | |
left := len(p) & 15 | |
do := len(p) - left | |
crc := ^ieeeCLMUL(^crc, p[:do]) | |
if left > 0 { | |
crc = update(crc, IEEETable, p[do:]) | |
} | |
return crc | |
} | |
// only use slicing-by-8 when input is >= 4KB | |
if len(p) >= 4096 { | |
iEEETable8Once.Do(func() { | |
iEEETable8 = makeTable8(IEEE) | |
}) | |
return updateSlicingBy8(crc, iEEETable8, p) | |
} | |
return update(crc, IEEETable, p) | |
} | |
// Update an IEEE crc32 Checksum. Based on | |
// http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf | |
// len(p) must be at least 64, and must be a multiple of 16. | |
func ieeeCLMUL(crc uint32, p []byte) uint32 { | |
in := x86.BytesToM128i(p) | |
crci := sse2.SetEpi64x(0,int64(crc)) | |
p1 := sse2.XorSi128(in[0], crci) | |
p2 := in[1] | |
p3 := in[2] | |
p4 := in[3] | |
for len(in) >= 4 { | |
var r1p2 = sse2.SetEpi64x(0x1c6e41596, 0x154442bd4) | |
t1 := pclmulqdq.Clmulepi64Si128(r1p2, p1, 0) | |
t2 := pclmulqdq.Clmulepi64Si128(r1p2, p2, 0) | |
t3 := pclmulqdq.Clmulepi64Si128(r1p2, p3, 0) | |
t4 := pclmulqdq.Clmulepi64Si128(r1p2, p4, 0) | |
t5 := pclmulqdq.Clmulepi64Si128(r1p2, p1, 0x11) | |
t6 := pclmulqdq.Clmulepi64Si128(r1p2, p2, 0x11) | |
t7 := pclmulqdq.Clmulepi64Si128(r1p2, p3, 0x11) | |
t8 := pclmulqdq.Clmulepi64Si128(r1p2, p4, 0x11) | |
t1 = sse2.XorSi128(t1, t5) | |
t2 = sse2.XorSi128(t2, t6) | |
t3 = sse2.XorSi128(t3, t7) | |
t4 = sse2.XorSi128(t4, t8) | |
p1 = sse2.XorSi128(t1, in[0]) | |
p2 = sse2.XorSi128(t2, in[1]) | |
p3 = sse2.XorSi128(t3, in[2]) | |
p4 = sse2.XorSi128(t4, in[3]) | |
in = in[4:] | |
} | |
/* Fold result into a single register (p1) */ | |
var r4r3 = sse2.SetEpi64x(0x0ccaa009e, 0x1751997d0 ) | |
// Merge p2 | |
t1 := pclmulqdq.Clmulepi64Si128(r4r3, p1, 0) | |
t2 := pclmulqdq.Clmulepi64Si128(r4r3, p1, 0x11) | |
p1 = sse2.XorSi128(t1, t2) | |
p1 = sse2.XorSi128(p1, p2) | |
// Merge p3 | |
t1 = pclmulqdq.Clmulepi64Si128(r4r3, p1, 0) | |
t2 = pclmulqdq.Clmulepi64Si128(r4r3, p1, 0x11) | |
p1 = sse2.XorSi128(t1, t2) | |
p1 = sse2.XorSi128(p1, p3) | |
// Merge p4 | |
t1 = pclmulqdq.Clmulepi64Si128(r4r3, p1, 0) | |
t2 = pclmulqdq.Clmulepi64Si128(r4r3, p1, 0x11) | |
p1 = sse2.XorSi128(t1, t2) | |
p1 = sse2.XorSi128(p1, p4) | |
// Encode remaining in 16 byte blocks | |
for len(in) > 0 { | |
t1 = pclmulqdq.Clmulepi64Si128(r4r3, p1, 0) | |
t2 = pclmulqdq.Clmulepi64Si128(r4r3, p1, 0x11) | |
p1 = sse2.XorSi128(t1, t2) | |
p1 = sse2.XorSi128(p1, in[0]) | |
in = in[1:] | |
} | |
// Merge result | |
t1 = pclmulqdq.Clmulepi64Si128(r4r3, p1, 0) | |
p1 = sse2.XorSi128(sse2.SrliSi128(p1, 8), t1) | |
mask := sse2.SrlEpi64(sse2.CmpeqEpi8(p1,p1), 32) | |
r5 := sse2.SetEpi64x(0, 0x163cd6124) | |
t1 = sse2.SrliSi128(p1, 4) | |
p1 = sse2.AndSi128(p1, mask) | |
p1 = pclmulqdq.Clmulepi64Si128(r5, p1, 0) | |
p1 = sse2.XorSi128(p1, t1) | |
rupoly := sse2.SetEpi64x(0x1db710641, 0x1f7011641) | |
t1 = sse2.AndSi128(p1, mask) | |
t1 = pclmulqdq.Clmulepi64Si128(rupoly, t1, 0x10) | |
t1 = sse2.AndSi128(t1, mask) | |
t1 = pclmulqdq.Clmulepi64Si128(rupoly, t1, 0x0) | |
p1 = sse2.XorSi128(p1, t1) | |
return sse4.ExtractEpi32(p1, 1) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment