|
diff --git a/src/blake2/blake2-config.h b/src/blake2/blake2-config.h |
|
new file mode 100644 |
|
index 0000000..a524aa9 |
|
--- /dev/null |
|
+++ b/src/blake2/blake2-config.h |
|
@@ -0,0 +1,72 @@ |
|
+/* |
|
+ BLAKE2 reference source code package - optimized C implementations |
|
+ |
|
+ Copyright 2012, Samuel Neves <[email protected]>. You may use this under the |
|
+ terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at |
|
+ your option. The terms of these licenses can be found at: |
|
+ |
|
+ - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 |
|
+ - OpenSSL license : https://www.openssl.org/source/license.html |
|
+ - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 |
|
+ |
|
+ More information about the BLAKE2 hash function can be found at |
|
+ https://blake2.net. |
|
+*/ |
|
+#ifndef BLAKE2_CONFIG_H |
|
+#define BLAKE2_CONFIG_H |
|
+ |
|
+/* These don't work everywhere */ |
|
+#if defined(__SSE2__) || defined(__x86_64__) || defined(__amd64__) |
|
+#define HAVE_SSE2 |
|
+#endif |
|
+ |
|
+#if defined(__SSSE3__) |
|
+#define HAVE_SSSE3 |
|
+#endif |
|
+ |
|
+#if defined(__SSE4_1__) |
|
+#define HAVE_SSE41 |
|
+#endif |
|
+ |
|
+#if defined(__AVX__) |
|
+#define HAVE_AVX |
|
+#endif |
|
+ |
|
+#if defined(__XOP__) |
|
+#define HAVE_XOP |
|
+#endif |
|
+ |
|
+ |
|
+#ifdef HAVE_AVX2 |
|
+#ifndef HAVE_AVX |
|
+#define HAVE_AVX |
|
+#endif |
|
+#endif |
|
+ |
|
+#ifdef HAVE_XOP |
|
+#ifndef HAVE_AVX |
|
+#define HAVE_AVX |
|
+#endif |
|
+#endif |
|
+ |
|
+#ifdef HAVE_AVX |
|
+#ifndef HAVE_SSE41 |
|
+#define HAVE_SSE41 |
|
+#endif |
|
+#endif |
|
+ |
|
+#ifdef HAVE_SSE41 |
|
+#ifndef HAVE_SSSE3 |
|
+#define HAVE_SSSE3 |
|
+#endif |
|
+#endif |
|
+ |
|
+#ifdef HAVE_SSSE3 |
|
+#define HAVE_SSE2 |
|
+#endif |
|
+ |
|
+#if !defined(HAVE_SSE2) |
|
+#error "This code requires at least SSE2." |
|
+#endif |
|
+ |
|
+#endif |
|
diff --git a/src/blake2/blake2-impl.h b/src/blake2/blake2-impl.h |
|
index 32ff316..c1df82e 100644 |
|
--- a/src/blake2/blake2-impl.h |
|
+++ b/src/blake2/blake2-impl.h |
|
@@ -5,9 +5,9 @@ |
|
terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at |
|
your option. The terms of these licenses can be found at: |
|
|
|
- - CC0 1.0 Universal : https://creativecommons.org/publicdomain/zero/1.0 |
|
+ - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 |
|
- OpenSSL license : https://www.openssl.org/source/license.html |
|
- - Apache 2.0 : https://www.apache.org/licenses/LICENSE-2.0 |
|
+ - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 |
|
|
|
More information about the BLAKE2 hash function can be found at |
|
https://blake2.net. |
|
@@ -72,8 +72,8 @@ static BLAKE2_INLINE uint16_t load16( const void *src ) |
|
return w; |
|
#else |
|
const uint8_t *p = ( const uint8_t * )src; |
|
- return (( uint16_t )( p[0] ) << 0) | |
|
- (( uint16_t )( p[1] ) << 8) ; |
|
+ return ( uint16_t )((( uint32_t )( p[0] ) << 0) | |
|
+ (( uint32_t )( p[1] ) << 8)); |
|
#endif |
|
} |
|
|
|
diff --git a/src/blake2/blake2.h b/src/blake2/blake2.h |
|
index d25d5fd..ad62f26 100644 |
|
--- a/src/blake2/blake2.h |
|
+++ b/src/blake2/blake2.h |
|
@@ -5,9 +5,9 @@ |
|
terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at |
|
your option. The terms of these licenses can be found at: |
|
|
|
- - CC0 1.0 Universal : https://creativecommons.org/publicdomain/zero/1.0 |
|
+ - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 |
|
- OpenSSL license : https://www.openssl.org/source/license.html |
|
- - Apache 2.0 : https://www.apache.org/licenses/LICENSE-2.0 |
|
+ - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 |
|
|
|
More information about the BLAKE2 hash function can be found at |
|
https://blake2.net. |
|
diff --git a/src/blake2/blake2b-load-sse2.h b/src/blake2/blake2b-load-sse2.h |
|
new file mode 100644 |
|
index 0000000..23a8d40 |
|
--- /dev/null |
|
+++ b/src/blake2/blake2b-load-sse2.h |
|
@@ -0,0 +1,68 @@ |
|
+/* |
|
+ BLAKE2 reference source code package - optimized C implementations |
|
+ |
|
+ Copyright 2012, Samuel Neves <[email protected]>. You may use this under the |
|
+ terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at |
|
+ your option. The terms of these licenses can be found at: |
|
+ |
|
+ - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 |
|
+ - OpenSSL license : https://www.openssl.org/source/license.html |
|
+ - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 |
|
+ |
|
+ More information about the BLAKE2 hash function can be found at |
|
+ https://blake2.net. |
|
+*/ |
|
+#ifndef BLAKE2B_LOAD_SSE2_H |
|
+#define BLAKE2B_LOAD_SSE2_H |
|
+ |
|
+#define LOAD_MSG_0_1(b0, b1) b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4) |
|
+#define LOAD_MSG_0_2(b0, b1) b0 = _mm_set_epi64x(m3, m1); b1 = _mm_set_epi64x(m7, m5) |
|
+#define LOAD_MSG_0_3(b0, b1) b0 = _mm_set_epi64x(m10, m8); b1 = _mm_set_epi64x(m14, m12) |
|
+#define LOAD_MSG_0_4(b0, b1) b0 = _mm_set_epi64x(m11, m9); b1 = _mm_set_epi64x(m15, m13) |
|
+#define LOAD_MSG_1_1(b0, b1) b0 = _mm_set_epi64x(m4, m14); b1 = _mm_set_epi64x(m13, m9) |
|
+#define LOAD_MSG_1_2(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m6, m15) |
|
+#define LOAD_MSG_1_3(b0, b1) b0 = _mm_set_epi64x(m0, m1); b1 = _mm_set_epi64x(m5, m11) |
|
+#define LOAD_MSG_1_4(b0, b1) b0 = _mm_set_epi64x(m2, m12); b1 = _mm_set_epi64x(m3, m7) |
|
+#define LOAD_MSG_2_1(b0, b1) b0 = _mm_set_epi64x(m12, m11); b1 = _mm_set_epi64x(m15, m5) |
|
+#define LOAD_MSG_2_2(b0, b1) b0 = _mm_set_epi64x(m0, m8); b1 = _mm_set_epi64x(m13, m2) |
|
+#define LOAD_MSG_2_3(b0, b1) b0 = _mm_set_epi64x(m3, m10); b1 = _mm_set_epi64x(m9, m7) |
|
+#define LOAD_MSG_2_4(b0, b1) b0 = _mm_set_epi64x(m6, m14); b1 = _mm_set_epi64x(m4, m1) |
|
+#define LOAD_MSG_3_1(b0, b1) b0 = _mm_set_epi64x(m3, m7); b1 = _mm_set_epi64x(m11, m13) |
|
+#define LOAD_MSG_3_2(b0, b1) b0 = _mm_set_epi64x(m1, m9); b1 = _mm_set_epi64x(m14, m12) |
|
+#define LOAD_MSG_3_3(b0, b1) b0 = _mm_set_epi64x(m5, m2); b1 = _mm_set_epi64x(m15, m4) |
|
+#define LOAD_MSG_3_4(b0, b1) b0 = _mm_set_epi64x(m10, m6); b1 = _mm_set_epi64x(m8, m0) |
|
+#define LOAD_MSG_4_1(b0, b1) b0 = _mm_set_epi64x(m5, m9); b1 = _mm_set_epi64x(m10, m2) |
|
+#define LOAD_MSG_4_2(b0, b1) b0 = _mm_set_epi64x(m7, m0); b1 = _mm_set_epi64x(m15, m4) |
|
+#define LOAD_MSG_4_3(b0, b1) b0 = _mm_set_epi64x(m11, m14); b1 = _mm_set_epi64x(m3, m6) |
|
+#define LOAD_MSG_4_4(b0, b1) b0 = _mm_set_epi64x(m12, m1); b1 = _mm_set_epi64x(m13, m8) |
|
+#define LOAD_MSG_5_1(b0, b1) b0 = _mm_set_epi64x(m6, m2); b1 = _mm_set_epi64x(m8, m0) |
|
+#define LOAD_MSG_5_2(b0, b1) b0 = _mm_set_epi64x(m10, m12); b1 = _mm_set_epi64x(m3, m11) |
|
+#define LOAD_MSG_5_3(b0, b1) b0 = _mm_set_epi64x(m7, m4); b1 = _mm_set_epi64x(m1, m15) |
|
+#define LOAD_MSG_5_4(b0, b1) b0 = _mm_set_epi64x(m5, m13); b1 = _mm_set_epi64x(m9, m14) |
|
+#define LOAD_MSG_6_1(b0, b1) b0 = _mm_set_epi64x(m1, m12); b1 = _mm_set_epi64x(m4, m14) |
|
+#define LOAD_MSG_6_2(b0, b1) b0 = _mm_set_epi64x(m15, m5); b1 = _mm_set_epi64x(m10, m13) |
|
+#define LOAD_MSG_6_3(b0, b1) b0 = _mm_set_epi64x(m6, m0); b1 = _mm_set_epi64x(m8, m9) |
|
+#define LOAD_MSG_6_4(b0, b1) b0 = _mm_set_epi64x(m3, m7); b1 = _mm_set_epi64x(m11, m2) |
|
+#define LOAD_MSG_7_1(b0, b1) b0 = _mm_set_epi64x(m7, m13); b1 = _mm_set_epi64x(m3, m12) |
|
+#define LOAD_MSG_7_2(b0, b1) b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m9, m1) |
|
+#define LOAD_MSG_7_3(b0, b1) b0 = _mm_set_epi64x(m15, m5); b1 = _mm_set_epi64x(m2, m8) |
|
+#define LOAD_MSG_7_4(b0, b1) b0 = _mm_set_epi64x(m4, m0); b1 = _mm_set_epi64x(m10, m6) |
|
+#define LOAD_MSG_8_1(b0, b1) b0 = _mm_set_epi64x(m14, m6); b1 = _mm_set_epi64x(m0, m11) |
|
+#define LOAD_MSG_8_2(b0, b1) b0 = _mm_set_epi64x(m9, m15); b1 = _mm_set_epi64x(m8, m3) |
|
+#define LOAD_MSG_8_3(b0, b1) b0 = _mm_set_epi64x(m13, m12); b1 = _mm_set_epi64x(m10, m1) |
|
+#define LOAD_MSG_8_4(b0, b1) b0 = _mm_set_epi64x(m7, m2); b1 = _mm_set_epi64x(m5, m4) |
|
+#define LOAD_MSG_9_1(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m1, m7) |
|
+#define LOAD_MSG_9_2(b0, b1) b0 = _mm_set_epi64x(m4, m2); b1 = _mm_set_epi64x(m5, m6) |
|
+#define LOAD_MSG_9_3(b0, b1) b0 = _mm_set_epi64x(m9, m15); b1 = _mm_set_epi64x(m13, m3) |
|
+#define LOAD_MSG_9_4(b0, b1) b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m0, m12) |
|
+#define LOAD_MSG_10_1(b0, b1) b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4) |
|
+#define LOAD_MSG_10_2(b0, b1) b0 = _mm_set_epi64x(m3, m1); b1 = _mm_set_epi64x(m7, m5) |
|
+#define LOAD_MSG_10_3(b0, b1) b0 = _mm_set_epi64x(m10, m8); b1 = _mm_set_epi64x(m14, m12) |
|
+#define LOAD_MSG_10_4(b0, b1) b0 = _mm_set_epi64x(m11, m9); b1 = _mm_set_epi64x(m15, m13) |
|
+#define LOAD_MSG_11_1(b0, b1) b0 = _mm_set_epi64x(m4, m14); b1 = _mm_set_epi64x(m13, m9) |
|
+#define LOAD_MSG_11_2(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m6, m15) |
|
+#define LOAD_MSG_11_3(b0, b1) b0 = _mm_set_epi64x(m0, m1); b1 = _mm_set_epi64x(m5, m11) |
|
+#define LOAD_MSG_11_4(b0, b1) b0 = _mm_set_epi64x(m2, m12); b1 = _mm_set_epi64x(m3, m7) |
|
+ |
|
+ |
|
+#endif |
|
diff --git a/src/blake2/blake2b-load-sse41.h b/src/blake2/blake2b-load-sse41.h |
|
new file mode 100644 |
|
index 0000000..0eca865 |
|
--- /dev/null |
|
+++ b/src/blake2/blake2b-load-sse41.h |
|
@@ -0,0 +1,402 @@ |
|
+/* |
|
+ BLAKE2 reference source code package - optimized C implementations |
|
+ |
|
+ Copyright 2012, Samuel Neves <[email protected]>. You may use this under the |
|
+ terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at |
|
+ your option. The terms of these licenses can be found at: |
|
+ |
|
+ - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 |
|
+ - OpenSSL license : https://www.openssl.org/source/license.html |
|
+ - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 |
|
+ |
|
+ More information about the BLAKE2 hash function can be found at |
|
+ https://blake2.net. |
|
+*/ |
|
+#ifndef BLAKE2B_LOAD_SSE41_H |
|
+#define BLAKE2B_LOAD_SSE41_H |
|
+ |
|
+#define LOAD_MSG_0_1(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_unpacklo_epi64(m0, m1); \ |
|
+b1 = _mm_unpacklo_epi64(m2, m3); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_0_2(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_unpackhi_epi64(m0, m1); \ |
|
+b1 = _mm_unpackhi_epi64(m2, m3); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_0_3(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_unpacklo_epi64(m4, m5); \ |
|
+b1 = _mm_unpacklo_epi64(m6, m7); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_0_4(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_unpackhi_epi64(m4, m5); \ |
|
+b1 = _mm_unpackhi_epi64(m6, m7); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_1_1(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_unpacklo_epi64(m7, m2); \ |
|
+b1 = _mm_unpackhi_epi64(m4, m6); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_1_2(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_unpacklo_epi64(m5, m4); \ |
|
+b1 = _mm_alignr_epi8(m3, m7, 8); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_1_3(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \ |
|
+b1 = _mm_unpackhi_epi64(m5, m2); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_1_4(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_unpacklo_epi64(m6, m1); \ |
|
+b1 = _mm_unpackhi_epi64(m3, m1); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_2_1(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_alignr_epi8(m6, m5, 8); \ |
|
+b1 = _mm_unpackhi_epi64(m2, m7); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_2_2(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_unpacklo_epi64(m4, m0); \ |
|
+b1 = _mm_blend_epi16(m1, m6, 0xF0); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_2_3(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_blend_epi16(m5, m1, 0xF0); \ |
|
+b1 = _mm_unpackhi_epi64(m3, m4); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_2_4(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_unpacklo_epi64(m7, m3); \ |
|
+b1 = _mm_alignr_epi8(m2, m0, 8); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_3_1(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_unpackhi_epi64(m3, m1); \ |
|
+b1 = _mm_unpackhi_epi64(m6, m5); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_3_2(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_unpackhi_epi64(m4, m0); \ |
|
+b1 = _mm_unpacklo_epi64(m6, m7); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_3_3(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_blend_epi16(m1, m2, 0xF0); \ |
|
+b1 = _mm_blend_epi16(m2, m7, 0xF0); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_3_4(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_unpacklo_epi64(m3, m5); \ |
|
+b1 = _mm_unpacklo_epi64(m0, m4); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_4_1(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_unpackhi_epi64(m4, m2); \ |
|
+b1 = _mm_unpacklo_epi64(m1, m5); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_4_2(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_blend_epi16(m0, m3, 0xF0); \ |
|
+b1 = _mm_blend_epi16(m2, m7, 0xF0); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_4_3(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_blend_epi16(m7, m5, 0xF0); \ |
|
+b1 = _mm_blend_epi16(m3, m1, 0xF0); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_4_4(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_alignr_epi8(m6, m0, 8); \ |
|
+b1 = _mm_blend_epi16(m4, m6, 0xF0); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_5_1(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_unpacklo_epi64(m1, m3); \ |
|
+b1 = _mm_unpacklo_epi64(m0, m4); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_5_2(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_unpacklo_epi64(m6, m5); \ |
|
+b1 = _mm_unpackhi_epi64(m5, m1); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_5_3(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_blend_epi16(m2, m3, 0xF0); \ |
|
+b1 = _mm_unpackhi_epi64(m7, m0); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_5_4(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_unpackhi_epi64(m6, m2); \ |
|
+b1 = _mm_blend_epi16(m7, m4, 0xF0); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_6_1(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_blend_epi16(m6, m0, 0xF0); \ |
|
+b1 = _mm_unpacklo_epi64(m7, m2); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_6_2(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_unpackhi_epi64(m2, m7); \ |
|
+b1 = _mm_alignr_epi8(m5, m6, 8); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_6_3(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_unpacklo_epi64(m0, m3); \ |
|
+b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_6_4(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_unpackhi_epi64(m3, m1); \ |
|
+b1 = _mm_blend_epi16(m1, m5, 0xF0); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_7_1(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_unpackhi_epi64(m6, m3); \ |
|
+b1 = _mm_blend_epi16(m6, m1, 0xF0); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_7_2(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_alignr_epi8(m7, m5, 8); \ |
|
+b1 = _mm_unpackhi_epi64(m0, m4); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_7_3(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_unpackhi_epi64(m2, m7); \ |
|
+b1 = _mm_unpacklo_epi64(m4, m1); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_7_4(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_unpacklo_epi64(m0, m2); \ |
|
+b1 = _mm_unpacklo_epi64(m3, m5); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_8_1(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_unpacklo_epi64(m3, m7); \ |
|
+b1 = _mm_alignr_epi8(m0, m5, 8); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_8_2(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_unpackhi_epi64(m7, m4); \ |
|
+b1 = _mm_alignr_epi8(m4, m1, 8); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_8_3(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = m6; \ |
|
+b1 = _mm_alignr_epi8(m5, m0, 8); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_8_4(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_blend_epi16(m1, m3, 0xF0); \ |
|
+b1 = m2; \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_9_1(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_unpacklo_epi64(m5, m4); \ |
|
+b1 = _mm_unpackhi_epi64(m3, m0); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_9_2(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_unpacklo_epi64(m1, m2); \ |
|
+b1 = _mm_blend_epi16(m3, m2, 0xF0); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_9_3(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_unpackhi_epi64(m7, m4); \ |
|
+b1 = _mm_unpackhi_epi64(m1, m6); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_9_4(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_alignr_epi8(m7, m5, 8); \ |
|
+b1 = _mm_unpacklo_epi64(m6, m0); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_10_1(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_unpacklo_epi64(m0, m1); \ |
|
+b1 = _mm_unpacklo_epi64(m2, m3); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_10_2(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_unpackhi_epi64(m0, m1); \ |
|
+b1 = _mm_unpackhi_epi64(m2, m3); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_10_3(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_unpacklo_epi64(m4, m5); \ |
|
+b1 = _mm_unpacklo_epi64(m6, m7); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_10_4(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_unpackhi_epi64(m4, m5); \ |
|
+b1 = _mm_unpackhi_epi64(m6, m7); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_11_1(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_unpacklo_epi64(m7, m2); \ |
|
+b1 = _mm_unpackhi_epi64(m4, m6); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_11_2(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_unpacklo_epi64(m5, m4); \ |
|
+b1 = _mm_alignr_epi8(m3, m7, 8); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_11_3(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \ |
|
+b1 = _mm_unpackhi_epi64(m5, m2); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#define LOAD_MSG_11_4(b0, b1) \ |
|
+do \ |
|
+{ \ |
|
+b0 = _mm_unpacklo_epi64(m6, m1); \ |
|
+b1 = _mm_unpackhi_epi64(m3, m1); \ |
|
+} while(0) |
|
+ |
|
+ |
|
+#endif |
|
diff --git a/src/blake2/blake2b-ref.c b/src/blake2/blake2b-ref.c |
|
index cbb1d8b..c8c1c5f 100644 |
|
--- a/src/blake2/blake2b-ref.c |
|
+++ b/src/blake2/blake2b-ref.c |
|
@@ -1,13 +1,13 @@ |
|
/* |
|
- BLAKE2 reference source code package - reference C implementations |
|
+ BLAKE2 reference source code package - optimized C implementations |
|
|
|
Copyright 2012, Samuel Neves <[email protected]>. You may use this under the |
|
terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at |
|
your option. The terms of these licenses can be found at: |
|
|
|
- - CC0 1.0 Universal : https://creativecommons.org/publicdomain/zero/1.0 |
|
+ - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 |
|
- OpenSSL license : https://www.openssl.org/source/license.html |
|
- - Apache 2.0 : https://www.apache.org/licenses/LICENSE-2.0 |
|
+ - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 |
|
|
|
More information about the BLAKE2 hash function can be found at |
|
https://blake2.net. |
|
@@ -20,6 +20,27 @@ |
|
#include "blake2.h" |
|
#include "blake2-impl.h" |
|
|
|
+#include "blake2-config.h" |
|
+ |
|
+#ifdef _MSC_VER |
|
+#include <intrin.h> /* for _mm_set_epi64x */ |
|
+#endif |
|
+#include <emmintrin.h> |
|
+#if defined(HAVE_SSSE3) |
|
+#include <tmmintrin.h> |
|
+#endif |
|
+#if defined(HAVE_SSE41) |
|
+#include <smmintrin.h> |
|
+#endif |
|
+#if defined(HAVE_AVX) |
|
+#include <immintrin.h> |
|
+#endif |
|
+#if defined(HAVE_XOP) |
|
+#include <x86intrin.h> |
|
+#endif |
|
+ |
|
+#include "blake2b-round.h" |
|
+ |
|
static const uint64_t blake2b_IV[8] = |
|
{ |
|
0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, |
|
@@ -28,29 +49,12 @@ static const uint64_t blake2b_IV[8] = |
|
0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL |
|
}; |
|
|
|
-static const uint8_t blake2b_sigma[12][16] = |
|
-{ |
|
- { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , |
|
- { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , |
|
- { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } , |
|
- { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } , |
|
- { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } , |
|
- { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } , |
|
- { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } , |
|
- { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } , |
|
- { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } , |
|
- { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } , |
|
- { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , |
|
- { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } |
|
-}; |
|
- |
|
- |
|
+/* Some helper functions */ |
|
static void blake2b_set_lastnode( blake2b_state *S ) |
|
{ |
|
S->f[1] = (uint64_t)-1; |
|
} |
|
|
|
-/* Some helper functions, not necessarily useful */ |
|
static int blake2b_is_lastblock( const blake2b_state *S ) |
|
{ |
|
return S->f[0] != 0; |
|
@@ -69,32 +73,25 @@ static void blake2b_increment_counter( blake2b_state *S, const uint64_t inc ) |
|
S->t[1] += ( S->t[0] < inc ); |
|
} |
|
|
|
-static void blake2b_init0( blake2b_state *S ) |
|
-{ |
|
- size_t i; |
|
- memset( S, 0, sizeof( blake2b_state ) ); |
|
- |
|
- for( i = 0; i < 8; ++i ) S->h[i] = blake2b_IV[i]; |
|
-} |
|
- |
|
/* init xors IV with input parameter block */ |
|
int blake2b_init_param( blake2b_state *S, const blake2b_param *P ) |
|
{ |
|
- const uint8_t *p = ( const uint8_t * )( P ); |
|
size_t i; |
|
- |
|
- blake2b_init0( S ); |
|
- |
|
+ /*blake2b_init0( S ); */ |
|
+ const unsigned char * v = ( const unsigned char * )( blake2b_IV ); |
|
+ const unsigned char * p = ( const unsigned char * )( P ); |
|
+ unsigned char * h = ( unsigned char * )( S->h ); |
|
/* IV XOR ParamBlock */ |
|
- for( i = 0; i < 8; ++i ) |
|
- S->h[i] ^= load64( p + sizeof( S->h[i] ) * i ); |
|
+ memset( S, 0, sizeof( blake2b_state ) ); |
|
+ |
|
+ for( i = 0; i < BLAKE2B_OUTBYTES; ++i ) h[i] = v[i] ^ p[i]; |
|
|
|
S->outlen = P->digest_length; |
|
return 0; |
|
} |
|
|
|
|
|
- |
|
+/* Some sort of default parameter block initialization, for sequential blake2b */ |
|
int blake2b_init( blake2b_state *S, size_t outlen ) |
|
{ |
|
blake2b_param P[1]; |
|
@@ -113,17 +110,17 @@ int blake2b_init( blake2b_state *S, size_t outlen ) |
|
memset( P->reserved, 0, sizeof( P->reserved ) ); |
|
memset( P->salt, 0, sizeof( P->salt ) ); |
|
memset( P->personal, 0, sizeof( P->personal ) ); |
|
+ |
|
return blake2b_init_param( S, P ); |
|
} |
|
|
|
- |
|
int blake2b_init_key( blake2b_state *S, size_t outlen, const void *key, size_t keylen ) |
|
{ |
|
blake2b_param P[1]; |
|
|
|
if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1; |
|
|
|
- if ( !key || !keylen || keylen > BLAKE2B_KEYBYTES ) return -1; |
|
+ if ( ( !keylen ) || keylen > BLAKE2B_KEYBYTES ) return -1; |
|
|
|
P->digest_length = (uint8_t)outlen; |
|
P->key_length = (uint8_t)keylen; |
|
@@ -138,7 +135,8 @@ int blake2b_init_key( blake2b_state *S, size_t outlen, const void *key, size_t k |
|
memset( P->salt, 0, sizeof( P->salt ) ); |
|
memset( P->personal, 0, sizeof( P->personal ) ); |
|
|
|
- if( blake2b_init_param( S, P ) < 0 ) return -1; |
|
+ if( blake2b_init_param( S, P ) < 0 ) |
|
+ return 0; |
|
|
|
{ |
|
uint8_t block[BLAKE2B_BLOCKBYTES]; |
|
@@ -150,53 +148,53 @@ int blake2b_init_key( blake2b_state *S, size_t outlen, const void *key, size_t k |
|
return 0; |
|
} |
|
|
|
-#define G(r,i,a,b,c,d) \ |
|
- do { \ |
|
- a = a + b + m[blake2b_sigma[r][2*i+0]]; \ |
|
- d = rotr64(d ^ a, 32); \ |
|
- c = c + d; \ |
|
- b = rotr64(b ^ c, 24); \ |
|
- a = a + b + m[blake2b_sigma[r][2*i+1]]; \ |
|
- d = rotr64(d ^ a, 16); \ |
|
- c = c + d; \ |
|
- b = rotr64(b ^ c, 63); \ |
|
- } while(0) |
|
- |
|
-#define ROUND(r) \ |
|
- do { \ |
|
- G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ |
|
- G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ |
|
- G(r,2,v[ 2],v[ 6],v[10],v[14]); \ |
|
- G(r,3,v[ 3],v[ 7],v[11],v[15]); \ |
|
- G(r,4,v[ 0],v[ 5],v[10],v[15]); \ |
|
- G(r,5,v[ 1],v[ 6],v[11],v[12]); \ |
|
- G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ |
|
- G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \ |
|
- } while(0) |
|
- |
|
static void blake2b_compress( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] ) |
|
{ |
|
- uint64_t m[16]; |
|
- uint64_t v[16]; |
|
- size_t i; |
|
- |
|
- for( i = 0; i < 16; ++i ) { |
|
- m[i] = load64( block + i * sizeof( m[i] ) ); |
|
- } |
|
- |
|
- for( i = 0; i < 8; ++i ) { |
|
- v[i] = S->h[i]; |
|
- } |
|
- |
|
- v[ 8] = blake2b_IV[0]; |
|
- v[ 9] = blake2b_IV[1]; |
|
- v[10] = blake2b_IV[2]; |
|
- v[11] = blake2b_IV[3]; |
|
- v[12] = blake2b_IV[4] ^ S->t[0]; |
|
- v[13] = blake2b_IV[5] ^ S->t[1]; |
|
- v[14] = blake2b_IV[6] ^ S->f[0]; |
|
- v[15] = blake2b_IV[7] ^ S->f[1]; |
|
- |
|
+ __m128i row1l, row1h; |
|
+ __m128i row2l, row2h; |
|
+ __m128i row3l, row3h; |
|
+ __m128i row4l, row4h; |
|
+ __m128i b0, b1; |
|
+ __m128i t0, t1; |
|
+#if defined(HAVE_SSSE3) && !defined(HAVE_XOP) |
|
+ const __m128i r16 = _mm_setr_epi8( 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9 ); |
|
+ const __m128i r24 = _mm_setr_epi8( 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10 ); |
|
+#endif |
|
+#if defined(HAVE_SSE41) |
|
+ const __m128i m0 = LOADU( block + 00 ); |
|
+ const __m128i m1 = LOADU( block + 16 ); |
|
+ const __m128i m2 = LOADU( block + 32 ); |
|
+ const __m128i m3 = LOADU( block + 48 ); |
|
+ const __m128i m4 = LOADU( block + 64 ); |
|
+ const __m128i m5 = LOADU( block + 80 ); |
|
+ const __m128i m6 = LOADU( block + 96 ); |
|
+ const __m128i m7 = LOADU( block + 112 ); |
|
+#else |
|
+ const uint64_t m0 = load64(block + 0 * sizeof(uint64_t)); |
|
+ const uint64_t m1 = load64(block + 1 * sizeof(uint64_t)); |
|
+ const uint64_t m2 = load64(block + 2 * sizeof(uint64_t)); |
|
+ const uint64_t m3 = load64(block + 3 * sizeof(uint64_t)); |
|
+ const uint64_t m4 = load64(block + 4 * sizeof(uint64_t)); |
|
+ const uint64_t m5 = load64(block + 5 * sizeof(uint64_t)); |
|
+ const uint64_t m6 = load64(block + 6 * sizeof(uint64_t)); |
|
+ const uint64_t m7 = load64(block + 7 * sizeof(uint64_t)); |
|
+ const uint64_t m8 = load64(block + 8 * sizeof(uint64_t)); |
|
+ const uint64_t m9 = load64(block + 9 * sizeof(uint64_t)); |
|
+ const uint64_t m10 = load64(block + 10 * sizeof(uint64_t)); |
|
+ const uint64_t m11 = load64(block + 11 * sizeof(uint64_t)); |
|
+ const uint64_t m12 = load64(block + 12 * sizeof(uint64_t)); |
|
+ const uint64_t m13 = load64(block + 13 * sizeof(uint64_t)); |
|
+ const uint64_t m14 = load64(block + 14 * sizeof(uint64_t)); |
|
+ const uint64_t m15 = load64(block + 15 * sizeof(uint64_t)); |
|
+#endif |
|
+ row1l = LOADU( &S->h[0] ); |
|
+ row1h = LOADU( &S->h[2] ); |
|
+ row2l = LOADU( &S->h[4] ); |
|
+ row2h = LOADU( &S->h[6] ); |
|
+ row3l = LOADU( &blake2b_IV[0] ); |
|
+ row3h = LOADU( &blake2b_IV[2] ); |
|
+ row4l = _mm_xor_si128( LOADU( &blake2b_IV[4] ), LOADU( &S->t[0] ) ); |
|
+ row4h = _mm_xor_si128( LOADU( &blake2b_IV[6] ), LOADU( &S->f[0] ) ); |
|
ROUND( 0 ); |
|
ROUND( 1 ); |
|
ROUND( 2 ); |
|
@@ -209,14 +207,16 @@ static void blake2b_compress( blake2b_state *S, const uint8_t block[BLAKE2B_BLOC |
|
ROUND( 9 ); |
|
ROUND( 10 ); |
|
ROUND( 11 ); |
|
- |
|
- for( i = 0; i < 8; ++i ) { |
|
- S->h[i] = S->h[i] ^ v[i] ^ v[i + 8]; |
|
- } |
|
+ row1l = _mm_xor_si128( row3l, row1l ); |
|
+ row1h = _mm_xor_si128( row3h, row1h ); |
|
+ STOREU( &S->h[0], _mm_xor_si128( LOADU( &S->h[0] ), row1l ) ); |
|
+ STOREU( &S->h[2], _mm_xor_si128( LOADU( &S->h[2] ), row1h ) ); |
|
+ row2l = _mm_xor_si128( row4l, row2l ); |
|
+ row2h = _mm_xor_si128( row4h, row2h ); |
|
+ STOREU( &S->h[4], _mm_xor_si128( LOADU( &S->h[4] ), row2l ) ); |
|
+ STOREU( &S->h[6], _mm_xor_si128( LOADU( &S->h[6] ), row2h ) ); |
|
} |
|
|
|
-#undef G |
|
-#undef ROUND |
|
|
|
int blake2b_update( blake2b_state *S, const void *pin, size_t inlen ) |
|
{ |
|
@@ -245,11 +245,9 @@ int blake2b_update( blake2b_state *S, const void *pin, size_t inlen ) |
|
return 0; |
|
} |
|
|
|
+ |
|
int blake2b_final( blake2b_state *S, void *out, size_t outlen ) |
|
{ |
|
- uint8_t buffer[BLAKE2B_OUTBYTES] = {0}; |
|
- size_t i; |
|
- |
|
if( out == NULL || outlen < S->outlen ) |
|
return -1; |
|
|
|
@@ -261,15 +259,11 @@ int blake2b_final( blake2b_state *S, void *out, size_t outlen ) |
|
memset( S->buf + S->buflen, 0, BLAKE2B_BLOCKBYTES - S->buflen ); /* Padding */ |
|
blake2b_compress( S, S->buf ); |
|
|
|
- for( i = 0; i < 8; ++i ) /* Output full hash to temp buffer */ |
|
- store64( buffer + sizeof( S->h[i] ) * i, S->h[i] ); |
|
- |
|
- memcpy( out, buffer, S->outlen ); |
|
- secure_zero_memory(buffer, sizeof(buffer)); |
|
+ memcpy( out, &S->h[0], S->outlen ); |
|
return 0; |
|
} |
|
|
|
-/* inlen, at least, should be uint64_t. Others can be size_t. */ |
|
+ |
|
int blake2b( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen ) |
|
{ |
|
blake2b_state S[1]; |
|
@@ -285,7 +279,7 @@ int blake2b( void *out, size_t outlen, const void *in, size_t inlen, const void |
|
|
|
if( keylen > BLAKE2B_KEYBYTES ) return -1; |
|
|
|
- if( keylen > 0 ) |
|
+ if( keylen ) |
|
{ |
|
if( blake2b_init_key( S, outlen, key, keylen ) < 0 ) return -1; |
|
} |
|
diff --git a/src/blake2/blake2b-round.h b/src/blake2/blake2b-round.h |
|
new file mode 100644 |
|
index 0000000..6537fff |
|
--- /dev/null |
|
+++ b/src/blake2/blake2b-round.h |
|
@@ -0,0 +1,157 @@ |
|
+/* |
|
+ BLAKE2 reference source code package - optimized C implementations |
|
+ |
|
+ Copyright 2012, Samuel Neves <[email protected]>. You may use this under the |
|
+ terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at |
|
+ your option. The terms of these licenses can be found at: |
|
+ |
|
+ - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 |
|
+ - OpenSSL license : https://www.openssl.org/source/license.html |
|
+ - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 |
|
+ |
|
+ More information about the BLAKE2 hash function can be found at |
|
+ https://blake2.net. |
|
+*/ |
|
+#ifndef BLAKE2B_ROUND_H |
|
+#define BLAKE2B_ROUND_H |
|
+ |
|
+#define LOADU(p) _mm_loadu_si128( (const __m128i *)(p) ) |
|
+#define STOREU(p,r) _mm_storeu_si128((__m128i *)(p), r) |
|
+ |
|
+#define TOF(reg) _mm_castsi128_ps((reg)) |
|
+#define TOI(reg) _mm_castps_si128((reg)) |
|
+ |
|
+#define LIKELY(x) __builtin_expect((x),1) |
|
+ |
|
+ |
|
+/* Microarchitecture-specific macros */ |
|
+#ifndef HAVE_XOP |
|
+#ifdef HAVE_SSSE3 |
|
+#define _mm_roti_epi64(x, c) \ |
|
+ (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \ |
|
+ : (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \ |
|
+ : (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \ |
|
+ : (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x))) \ |
|
+ : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c)))) |
|
+#else |
|
+#define _mm_roti_epi64(r, c) _mm_xor_si128(_mm_srli_epi64( (r), -(c) ),_mm_slli_epi64( (r), 64-(-(c)) )) |
|
+#endif |
|
+#else |
|
+/* ... */ |
|
+#endif |
|
+ |
|
+ |
|
+ |
|
+#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ |
|
+ row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ |
|
+ row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ |
|
+ \ |
|
+ row4l = _mm_xor_si128(row4l, row1l); \ |
|
+ row4h = _mm_xor_si128(row4h, row1h); \ |
|
+ \ |
|
+ row4l = _mm_roti_epi64(row4l, -32); \ |
|
+ row4h = _mm_roti_epi64(row4h, -32); \ |
|
+ \ |
|
+ row3l = _mm_add_epi64(row3l, row4l); \ |
|
+ row3h = _mm_add_epi64(row3h, row4h); \ |
|
+ \ |
|
+ row2l = _mm_xor_si128(row2l, row3l); \ |
|
+ row2h = _mm_xor_si128(row2h, row3h); \ |
|
+ \ |
|
+ row2l = _mm_roti_epi64(row2l, -24); \ |
|
+ row2h = _mm_roti_epi64(row2h, -24); \ |
|
+ |
|
+#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ |
|
+ row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ |
|
+ row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ |
|
+ \ |
|
+ row4l = _mm_xor_si128(row4l, row1l); \ |
|
+ row4h = _mm_xor_si128(row4h, row1h); \ |
|
+ \ |
|
+ row4l = _mm_roti_epi64(row4l, -16); \ |
|
+ row4h = _mm_roti_epi64(row4h, -16); \ |
|
+ \ |
|
+ row3l = _mm_add_epi64(row3l, row4l); \ |
|
+ row3h = _mm_add_epi64(row3h, row4h); \ |
|
+ \ |
|
+ row2l = _mm_xor_si128(row2l, row3l); \ |
|
+ row2h = _mm_xor_si128(row2h, row3h); \ |
|
+ \ |
|
+ row2l = _mm_roti_epi64(row2l, -63); \ |
|
+ row2h = _mm_roti_epi64(row2h, -63); \ |
|
+ |
|
+#if defined(HAVE_SSSE3) |
|
+#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ |
|
+ t0 = _mm_alignr_epi8(row2h, row2l, 8); \ |
|
+ t1 = _mm_alignr_epi8(row2l, row2h, 8); \ |
|
+ row2l = t0; \ |
|
+ row2h = t1; \ |
|
+ \ |
|
+ t0 = row3l; \ |
|
+ row3l = row3h; \ |
|
+ row3h = t0; \ |
|
+ \ |
|
+ t0 = _mm_alignr_epi8(row4h, row4l, 8); \ |
|
+ t1 = _mm_alignr_epi8(row4l, row4h, 8); \ |
|
+ row4l = t1; \ |
|
+ row4h = t0; |
|
+ |
|
+#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ |
|
+ t0 = _mm_alignr_epi8(row2l, row2h, 8); \ |
|
+ t1 = _mm_alignr_epi8(row2h, row2l, 8); \ |
|
+ row2l = t0; \ |
|
+ row2h = t1; \ |
|
+ \ |
|
+ t0 = row3l; \ |
|
+ row3l = row3h; \ |
|
+ row3h = t0; \ |
|
+ \ |
|
+ t0 = _mm_alignr_epi8(row4l, row4h, 8); \ |
|
+ t1 = _mm_alignr_epi8(row4h, row4l, 8); \ |
|
+ row4l = t1; \ |
|
+ row4h = t0; |
|
+#else |
|
+ |
|
+#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ |
|
+ t0 = row4l;\ |
|
+ t1 = row2l;\ |
|
+ row4l = row3l;\ |
|
+ row3l = row3h;\ |
|
+ row3h = row4l;\ |
|
+ row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); \ |
|
+ row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); \ |
|
+ row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); \ |
|
+ row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)) |
|
+ |
|
+#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ |
|
+ t0 = row3l;\ |
|
+ row3l = row3h;\ |
|
+ row3h = t0;\ |
|
+ t0 = row2l;\ |
|
+ t1 = row4l;\ |
|
+ row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); \ |
|
+ row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); \ |
|
+ row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); \ |
|
+ row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)) |
|
+ |
|
+#endif |
|
+ |
|
+#if defined(HAVE_SSE41) |
|
+#include "blake2b-load-sse41.h" |
|
+#else |
|
+#include "blake2b-load-sse2.h" |
|
+#endif |
|
+ |
|
+#define ROUND(r) \ |
|
+ LOAD_MSG_ ##r ##_1(b0, b1); \ |
|
+ G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ |
|
+ LOAD_MSG_ ##r ##_2(b0, b1); \ |
|
+ G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ |
|
+ DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ |
|
+ LOAD_MSG_ ##r ##_3(b0, b1); \ |
|
+ G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ |
|
+ LOAD_MSG_ ##r ##_4(b0, b1); \ |
|
+ G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ |
|
+ UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); |
|
+ |
|
+#endif |