Last active
September 26, 2023 08:34
-
-
Save danny8376/a2e3ea9e9ef42eeca4cb364ac82f975e to your computer and use it in GitHub Desktop.
sha256_12 for mii lfcs hash
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* sha256_16/12 | |
* again specialized to only take 16/12 bytes input and spit out the first 16/last 8 bytes | |
* again code dug out from mbed TLS | |
* https://github.com/ARMmbed/mbedtls/blob/development/library/sha256.c | |
*/ | |
// adopted from: https://github.com/Jimmy-Z/bfCL/blob/master/cl/sha256_16.cl | |
#include <stdint.h> | |
const uint32_t K[] = | |
{ | |
0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, | |
0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5, | |
0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, | |
0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174, | |
0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, | |
0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA, | |
0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, | |
0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967, | |
0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, | |
0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85, | |
0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, | |
0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070, | |
0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, | |
0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3, | |
0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, | |
0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2, | |
}; | |
#define SHR(x,n) ((x & 0xFFFFFFFF) >> n) | |
#define ROTR(x,n) (SHR(x,n) | (x << (32 - n))) | |
#define S0(x) (ROTR(x, 7) ^ ROTR(x,18) ^ SHR(x, 3)) | |
#define S1(x) (ROTR(x,17) ^ ROTR(x,19) ^ SHR(x,10)) | |
#define S2(x) (ROTR(x, 2) ^ ROTR(x,13) ^ ROTR(x,22)) | |
#define S3(x) (ROTR(x, 6) ^ ROTR(x,11) ^ ROTR(x,25)) | |
#define F0(x,y,z) ((x & y) | (z & (x | y))) | |
#define F1(x,y,z) (z ^ (x & (y ^ z))) | |
#define R(t) \ | |
( \ | |
W[t] = S1(W[t - 2]) + W[t - 7] + \ | |
S0(W[t - 15]) + W[t - 16] \ | |
) | |
#define P(a,b,c,d,e,f,g,h,x,K) \ | |
{ \ | |
temp1 = h + S3(e) + F1(e,f,g) + K + x; \ | |
temp2 = S2(a) + F0(a,b,c); \ | |
d += temp1; h = temp1 + temp2; \ | |
} | |
void sha256_12(uint32_t *src, uint32_t *hash) // uint32_t[3] src, uint32_t[2] hash | |
{ | |
uint32_t temp1, temp2, W[64]; | |
uint32_t A[8] = { | |
0x6A09E667, | |
0xBB67AE85, | |
0x3C6EF372, | |
0xA54FF53A, | |
0x510E527F, | |
0x9B05688C, | |
0x1F83D9AB, | |
0x5BE0CD19 | |
}; | |
unsigned int i; | |
// padding and msglen identical/similar to sha1_16 | |
W[0] = src[0]; | |
W[1] = src[1]; | |
W[2] = src[2]; | |
W[3] = 0x80000000u; | |
W[4] = 0; | |
W[5] = 0; W[6] = 0; W[7] = 0; | |
W[8] = 0; W[9] = 0; W[10] = 0; W[11] = 0; | |
W[12] = 0; W[13] = 0; W[14] = 0; | |
W[15] = 0x60u; | |
for (i = 0; i < 16; i += 8) | |
{ | |
P(A[0], A[1], A[2], A[3], A[4], A[5], A[6], A[7], W[i + 0], K[i + 0]); | |
P(A[7], A[0], A[1], A[2], A[3], A[4], A[5], A[6], W[i + 1], K[i + 1]); | |
P(A[6], A[7], A[0], A[1], A[2], A[3], A[4], A[5], W[i + 2], K[i + 2]); | |
P(A[5], A[6], A[7], A[0], A[1], A[2], A[3], A[4], W[i + 3], K[i + 3]); | |
P(A[4], A[5], A[6], A[7], A[0], A[1], A[2], A[3], W[i + 4], K[i + 4]); | |
P(A[3], A[4], A[5], A[6], A[7], A[0], A[1], A[2], W[i + 5], K[i + 5]); | |
P(A[2], A[3], A[4], A[5], A[6], A[7], A[0], A[1], W[i + 6], K[i + 6]); | |
P(A[1], A[2], A[3], A[4], A[5], A[6], A[7], A[0], W[i + 7], K[i + 7]); | |
} | |
for (i = 16; i < 64; i += 8) | |
{ | |
P(A[0], A[1], A[2], A[3], A[4], A[5], A[6], A[7], R(i + 0), K[i + 0]); | |
P(A[7], A[0], A[1], A[2], A[3], A[4], A[5], A[6], R(i + 1), K[i + 1]); | |
P(A[6], A[7], A[0], A[1], A[2], A[3], A[4], A[5], R(i + 2), K[i + 2]); | |
P(A[5], A[6], A[7], A[0], A[1], A[2], A[3], A[4], R(i + 3), K[i + 3]); | |
P(A[4], A[5], A[6], A[7], A[0], A[1], A[2], A[3], R(i + 4), K[i + 4]); | |
P(A[3], A[4], A[5], A[6], A[7], A[0], A[1], A[2], R(i + 5), K[i + 5]); | |
P(A[2], A[3], A[4], A[5], A[6], A[7], A[0], A[1], R(i + 6), K[i + 6]); | |
P(A[1], A[2], A[3], A[4], A[5], A[6], A[7], A[0], R(i + 7), K[i + 7]); | |
} | |
A[6] += 0x1F83D9AB; | |
A[7] += 0x5BE0CD19; | |
hash[0] = A[6]; | |
hash[1] = A[7]; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from functools import reduce | |
# ref: https://github.com/Jimmy-Z/bfCL/blob/master/cl/sha256_16.cl | |
A=[ | |
0x6a09e667,0xbb67ae85,0x3c6ef372,0xa54ff53a, | |
0x510e527f,0x9b05688c,0x1f83d9ab,0x5be0cd19 | |
] | |
K=[ | |
0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5, | |
0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5, | |
0xd807aa98,0x12835b01,0x243185be,0x550c7dc3, | |
0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174, | |
0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc, | |
0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da, | |
0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7, | |
0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967, | |
0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13, | |
0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85, | |
0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3, | |
0xd192e819,0xd6990624,0xf40e3585,0x106aa070, | |
0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5, | |
0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3, | |
0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208, | |
0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 | |
] | |
def mod32add(*vals): | |
return reduce(lambda x, y: x+y, vals) & 0xFFFFFFFF | |
def shr(v, n): | |
return (v & 0xFFFFFFFF) >> n | |
def ror(v, n): | |
return shr(v,n) | ((v << (32 - n)) & 0xFFFFFFFF) | |
def f0(x,y,z): | |
return ((x & y) | (z & (x | y))) | |
def f1(x,y,z): | |
return (z ^ (x & (y ^ z))) | |
def s0(x): | |
return ror(x,7) ^ ror(x,18) ^ shr(x,3) | |
def s1(x): | |
return ror(x,17) ^ ror(x,19) ^ shr(x,10) | |
def s2(x): | |
return ror(x,2) ^ ror(x,13) ^ ror(x,22) | |
def s3(x): | |
return ror(x,6) ^ ror(x,11) ^ ror(x,25) | |
def sha256_12(input): | |
w=[*map(lambda b: int.from_bytes(b, "big"), [input[i:i+4] for i in range(0, 12, 4)])] | |
w+=[0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x60] | |
a,b,c,d,e,f,g,h=A | |
for i in range(0,64): | |
if i < 16: | |
t1=mod32add(h,s3(e),f1(e,f,g),K[i],w[i]) | |
t2=mod32add(s2(a),f0(a,b,c)) | |
a,b,c,d,e,f,g,h=mod32add(t1,t2),a,b,c,mod32add(d,t1),e,f,g | |
else: | |
w.append(mod32add(s1(w[i-2]),w[i-7],s0(w[i-15]),w[i-16])) | |
t1=mod32add(h,s3(e),f1(e,f,g),K[i],w[i]) | |
t2=mod32add(s2(a),f0(a,b,c)) | |
a,b,c,d,e,f,g,h=mod32add(t1,t2),a,b,c,mod32add(d,t1),e,f,g | |
return "{0:08x}{1:08x}".format(mod32add(0x1F83D9AB,g), mod32add(0x5BE0CD19,h)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* sha256-x86.c - Intel SHA extensions using C intrinsics */ | |
/* Written and place in public domain by Jeffrey Walton */ | |
/* Based on code from Intel, and by Sean Gulley for */ | |
/* the miTLS project. */ | |
// adopted from: https://github.com/noloader/SHA-Intrinsics/blob/master/sha256-x86.c | |
// ref: https://github.com/Jimmy-Z/bfCL/blob/master/cl/sha256_16.cl | |
// gcc -DTEST_MAIN -msse4.1 -msha sha256-x86.c -o sha256.exe | |
// Include the GCC super header | |
#if defined(__GNUC__) | |
# include <stdint.h> | |
# include <x86intrin.h> | |
#endif | |
// Microsoft supports Intel SHA ACLE extensions as of Visual Studio 2015 | |
#if defined(_MSC_VER) | |
# include <immintrin.h> | |
# define WIN32_LEAN_AND_MEAN | |
# include <Windows.h> | |
typedef UINT32 uint32_t; | |
typedef UINT8 uint8_t; | |
#endif | |
#include <string.h> | |
static const uint8_t DATA12[16] __attribute__ ((aligned (16))) = { | |
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 | |
}; | |
static const uint8_t DATA3[16] __attribute__ ((aligned (16))) = { | |
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00 | |
}; | |
static const uint8_t INIT[][16] __attribute__ ((aligned (16))) = { | |
{ 0x8C, 0x68, 0x05, 0x9B, 0x7F, 0x52, 0x0E, 0x51, 0x85, 0xAE, 0x67, 0xBB, 0x67, 0xE6, 0x09, 0x6A }, | |
{ 0x19, 0xCD, 0xE0, 0x5B, 0xAB, 0xD9, 0x83, 0x1F, 0x3A, 0xF5, 0x4F, 0xA5, 0x72, 0xF3, 0x6E, 0x3C } | |
}; | |
static const uint8_t SHAC[][16] __attribute__ ((aligned (16))) = { | |
{ 0x98, 0x2F, 0x8A, 0x42, 0x91, 0x44, 0x37, 0x71, 0xCF, 0xFB, 0xC0, 0xB5, 0xA5, 0xDB, 0xB5, 0xE9 }, | |
{ 0x5B, 0xC2, 0x56, 0x39, 0xF1, 0x11, 0xF1, 0x59, 0xA4, 0x82, 0x3F, 0x92, 0xD5, 0x5E, 0x1C, 0xAB }, | |
{ 0x98, 0xAA, 0x07, 0xD8, 0x01, 0x5B, 0x83, 0x12, 0xBE, 0x85, 0x31, 0x24, 0xC3, 0x7D, 0x0C, 0x55 }, | |
{ 0x74, 0x5D, 0xBE, 0x72, 0xFE, 0xB1, 0xDE, 0x80, 0xA7, 0x06, 0xDC, 0x9B, 0x74, 0xF1, 0x9B, 0xC1 }, | |
{ 0xC1, 0x69, 0x9B, 0xE4, 0x86, 0x47, 0xBE, 0xEF, 0xC6, 0x9D, 0xC1, 0x0F, 0xCC, 0xA1, 0x0C, 0x24 }, | |
{ 0x6F, 0x2C, 0xE9, 0x2D, 0xAA, 0x84, 0x74, 0x4A, 0xDC, 0xA9, 0xB0, 0x5C, 0xDA, 0x88, 0xF9, 0x76 }, | |
{ 0x52, 0x51, 0x3E, 0x98, 0x6D, 0xC6, 0x31, 0xA8, 0xC8, 0x27, 0x03, 0xB0, 0xC7, 0x7F, 0x59, 0xBF }, | |
{ 0xF3, 0x0B, 0xE0, 0xC6, 0x47, 0x91, 0xA7, 0xD5, 0x51, 0x63, 0xCA, 0x06, 0x67, 0x29, 0x29, 0x14 }, | |
{ 0x85, 0x0A, 0xB7, 0x27, 0x38, 0x21, 0x1B, 0x2E, 0xFC, 0x6D, 0x2C, 0x4D, 0x13, 0x0D, 0x38, 0x53 }, | |
{ 0x54, 0x73, 0x0A, 0x65, 0xBB, 0x0A, 0x6A, 0x76, 0x2E, 0xC9, 0xC2, 0x81, 0x85, 0x2C, 0x72, 0x92 }, | |
{ 0xA1, 0xE8, 0xBF, 0xA2, 0x4B, 0x66, 0x1A, 0xA8, 0x70, 0x8B, 0x4B, 0xC2, 0xA3, 0x51, 0x6C, 0xC7 }, | |
{ 0x19, 0xE8, 0x92, 0xD1, 0x24, 0x06, 0x99, 0xD6, 0x85, 0x35, 0x0E, 0xF4, 0x70, 0xA0, 0x6A, 0x10 }, | |
{ 0x16, 0xC1, 0xA4, 0x19, 0x08, 0x6C, 0x37, 0x1E, 0x4C, 0x77, 0x48, 0x27, 0xB5, 0xBC, 0xB0, 0x34 }, | |
{ 0xB3, 0x0C, 0x1C, 0x39, 0x4A, 0xAA, 0xD8, 0x4E, 0x4F, 0xCA, 0x9C, 0x5B, 0xF3, 0x6F, 0x2E, 0x68 }, | |
{ 0xEE, 0x82, 0x8F, 0x74, 0x6F, 0x63, 0xA5, 0x78, 0x14, 0x78, 0xC8, 0x84, 0x08, 0x02, 0xC7, 0x8C }, | |
{ 0xFA, 0xFF, 0xBE, 0x90, 0xEB, 0x6C, 0x50, 0xA4, 0xF7, 0xA3, 0xF9, 0xBE, 0xF2, 0x78, 0x71, 0xC6 } | |
}; | |
static const uint8_t SWAPC[][16] __attribute__ ((aligned (16))) = { | |
0x19, 0xCD, 0xE0, 0x5B, 0xAB, 0xD9, 0x83, 0x1F, 0x3A, 0xF5, 0x4F, 0xA5, 0x72, 0xf3, 0x6E, 0x3C | |
}; | |
void sha256_12_shaext(const uint8_t *seed, uint8_t *hash) { | |
uint8_t DATA0[16] __attribute__ ((aligned (8))) = { | |
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 | |
}; | |
memcpy(DATA0, seed, 12); | |
register __m128i STATE0 asm ("xmm3"), STATE1 asm ("xmm4"); | |
register __m128i MSG asm ("xmm5"), TMP asm ("xmm6"); | |
register __m128i MSG0 asm ("xmm7"), MSG1 asm ("xmm8"), MSG2 asm ("xmm9"), MSG3 asm ("xmm10"); | |
// Load initial values, pre shuffled | |
STATE0 = *((__m128i*)INIT[0]); | |
STATE1 = *((__m128i*)INIT[1]); | |
// Rounds 0-3 | |
MSG0 = *((__m128i*)&DATA0); | |
MSG = _mm_add_epi32(MSG0, *((__m128i*)SHAC[0])); | |
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); | |
MSG = _mm_shuffle_epi32(MSG, 0x0E); | |
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); | |
// Rounds 4-7 | |
MSG1 = *((__m128i*)&DATA12); | |
MSG = _mm_add_epi32(MSG1, *((__m128i*)SHAC[1])); | |
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); | |
MSG = _mm_shuffle_epi32(MSG, 0x0E); | |
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); | |
MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); | |
// Rounds 8-11 | |
MSG2 = *((__m128i*)&DATA12); | |
MSG = _mm_add_epi32(MSG2, *((__m128i*)SHAC[2])); | |
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); | |
MSG = _mm_shuffle_epi32(MSG, 0x0E); | |
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); | |
MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); | |
// Rounds 12-15 | |
MSG3 = *((__m128i*)&DATA3); | |
MSG = _mm_add_epi32(MSG3, *((__m128i*)SHAC[3])); | |
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); | |
TMP = _mm_alignr_epi8(MSG3, MSG2, 4); | |
MSG0 = _mm_add_epi32(MSG0, TMP); | |
MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); | |
MSG = _mm_shuffle_epi32(MSG, 0x0E); | |
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); | |
MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); | |
// Rounds 16-19 | |
MSG = _mm_add_epi32(MSG0, *((__m128i*)SHAC[4])); | |
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); | |
TMP = _mm_alignr_epi8(MSG0, MSG3, 4); | |
MSG1 = _mm_add_epi32(MSG1, TMP); | |
MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); | |
MSG = _mm_shuffle_epi32(MSG, 0x0E); | |
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); | |
MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); | |
// Rounds 20-23 | |
MSG = _mm_add_epi32(MSG1, *((__m128i*)SHAC[5])); | |
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); | |
TMP = _mm_alignr_epi8(MSG1, MSG0, 4); | |
MSG2 = _mm_add_epi32(MSG2, TMP); | |
MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); | |
MSG = _mm_shuffle_epi32(MSG, 0x0E); | |
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); | |
MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); | |
// Rounds 24-27 | |
MSG = _mm_add_epi32(MSG2, *((__m128i*)SHAC[6])); | |
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); | |
TMP = _mm_alignr_epi8(MSG2, MSG1, 4); | |
MSG3 = _mm_add_epi32(MSG3, TMP); | |
MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); | |
MSG = _mm_shuffle_epi32(MSG, 0x0E); | |
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); | |
MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); | |
// Rounds 28-31 | |
MSG = _mm_add_epi32(MSG3, *((__m128i*)SHAC[7])); | |
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); | |
TMP = _mm_alignr_epi8(MSG3, MSG2, 4); | |
MSG0 = _mm_add_epi32(MSG0, TMP); | |
MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); | |
MSG = _mm_shuffle_epi32(MSG, 0x0E); | |
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); | |
MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); | |
// Rounds 32-35 | |
MSG = _mm_add_epi32(MSG0, *((__m128i*)SHAC[8])); | |
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); | |
TMP = _mm_alignr_epi8(MSG0, MSG3, 4); | |
MSG1 = _mm_add_epi32(MSG1, TMP); | |
MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); | |
MSG = _mm_shuffle_epi32(MSG, 0x0E); | |
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); | |
MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); | |
// Rounds 36-39 | |
MSG = _mm_add_epi32(MSG1, *((__m128i*)SHAC[9])); | |
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); | |
TMP = _mm_alignr_epi8(MSG1, MSG0, 4); | |
MSG2 = _mm_add_epi32(MSG2, TMP); | |
MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); | |
MSG = _mm_shuffle_epi32(MSG, 0x0E); | |
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); | |
MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); | |
// Rounds 40-43 | |
MSG = _mm_add_epi32(MSG2, *((__m128i*)SHAC[10])); | |
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); | |
TMP = _mm_alignr_epi8(MSG2, MSG1, 4); | |
MSG3 = _mm_add_epi32(MSG3, TMP); | |
MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); | |
MSG = _mm_shuffle_epi32(MSG, 0x0E); | |
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); | |
MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); | |
// Rounds 44-47 | |
MSG = _mm_add_epi32(MSG3, *((__m128i*)SHAC[11])); | |
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); | |
TMP = _mm_alignr_epi8(MSG3, MSG2, 4); | |
MSG0 = _mm_add_epi32(MSG0, TMP); | |
MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); | |
MSG = _mm_shuffle_epi32(MSG, 0x0E); | |
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); | |
MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); | |
// Rounds 48-51 | |
MSG = _mm_add_epi32(MSG0, *((__m128i*)SHAC[12])); | |
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); | |
TMP = _mm_alignr_epi8(MSG0, MSG3, 4); | |
MSG1 = _mm_add_epi32(MSG1, TMP); | |
MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); | |
MSG = _mm_shuffle_epi32(MSG, 0x0E); | |
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); | |
MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); | |
// Rounds 52-55 | |
MSG = _mm_add_epi32(MSG1, *((__m128i*)SHAC[13])); | |
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); | |
TMP = _mm_alignr_epi8(MSG1, MSG0, 4); | |
MSG2 = _mm_add_epi32(MSG2, TMP); | |
MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); | |
MSG = _mm_shuffle_epi32(MSG, 0x0E); | |
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); | |
// Rounds 56-59 | |
MSG = _mm_add_epi32(MSG2, *((__m128i*)SHAC[14])); | |
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); | |
TMP = _mm_alignr_epi8(MSG2, MSG1, 4); | |
MSG3 = _mm_add_epi32(MSG3, TMP); | |
MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); | |
MSG = _mm_shuffle_epi32(MSG, 0x0E); | |
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); | |
// Rounds 60-63 | |
MSG = _mm_add_epi32(MSG3, *((__m128i*)SHAC[15])); | |
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); | |
MSG = _mm_shuffle_epi32(MSG, 0x0E); | |
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); | |
// Combine state | |
STATE1 = _mm_add_epi32(STATE1, *((__m128i*)&SWAPC)); | |
TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA | |
STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG | |
STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA | |
STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF | |
// Output | |
*((uint64_t*)hash) = _mm_extract_epi64(STATE1, 1); | |
*((uint32_t*)hash) = (uint32_t)__builtin_bswap32(*((uint32_t*)hash)); | |
*((uint32_t*)(hash+4)) = (uint32_t)__builtin_bswap32(*((uint32_t*)(hash+4))); | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
.intel_syntax noprefix | |
# adopted from: https://github.com/noloader/SHA-Intrinsics/blob/master/sha256-x86.c | |
# ref: https://github.com/Jimmy-Z/bfCL/blob/master/cl/sha256_16.cl | |
#ifdef __APPLE__ | |
#define _func(name) \ | |
.global _##name ; \ | |
_##name: | |
#else | |
#define _func(name) \ | |
.global name ; \ | |
name: | |
#endif | |
#ifdef _WIN32 | |
#define rp1 rcx | |
#define rp2 rdx | |
#define rp3 r8 | |
#define rp4 r9 | |
#define ep1 ecx | |
#define ep2 edx | |
#define ep3 r8d | |
#define ep4 r9d | |
#define p1 cx | |
#define p2 dx | |
#define p3 r8w | |
#define p4 r9w | |
#define p5ptr [rbp+0x48] | |
#else | |
#define rp1 rdi | |
#define rp2 rsi | |
#define rp3 rdx | |
#define rp4 rcx | |
#define ep1 edi | |
#define ep2 esi | |
#define ep3 edx | |
#define ep4 ecx | |
#define p1 di | |
#define p2 si | |
#define p3 dx | |
#define p4 cx | |
#define p5ptr [r8] | |
#endif | |
#define _xmmd(name) xmmword ptr [rip+name] | |
#define _qdat(name) qword ptr [rip+name] | |
#define _ddat(name) dword ptr [rip+name] | |
#define _xmm2sp(offset, regn) \ | |
movdqa [rsp+16*offset], xmm##regn ; | |
#define _sp2xmm(regn, offset) \ | |
movdqa xmm##regn, [rsp+16*offset] ; | |
#define _shasr1(s0, s1, tmpr, regt, msg, k) \ | |
movdqa regt, msg ; \ | |
paddd regt, _xmmd(k) ; \ | |
sha256rnds2 s1, s0, regt ; \ | |
pshufd regt, regt, 0xE ; \ | |
sha256rnds2 s0, s1, regt ; | |
#define _shasr2(s0, s1, tmpr, regt, msg, pmsg, k) \ | |
movdqa regt, msg ; \ | |
paddd regt, _xmmd(k) ; \ | |
sha256rnds2 s1, s0, regt ; \ | |
pshufd regt, regt, 0xE ; \ | |
sha256rnds2 s0, s1, regt ; \ | |
sha256msg1 pmsg, msg ; | |
#define _shasr3(s0, s1, tmpr, regt, regt2, msg, pmsg, nmsg, k) \ | |
movdqa regt, msg ; \ | |
paddd regt, _xmmd(k) ; \ | |
sha256rnds2 s1, s0, regt ; \ | |
movdqa regt2, msg ; \ | |
palignr regt2, pmsg, 4 ; \ | |
paddd nmsg, regt2 ; \ | |
sha256msg2 nmsg, msg ; \ | |
pshufd regt, regt, 0xE ; \ | |
sha256rnds2 s0, s1, regt ; \ | |
sha256msg1 pmsg, msg ; | |
.data | |
.align 8 | |
CD: .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 | |
.align 16 | |
D12: .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 | |
D3: .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00 | |
I0: .byte 0x8C, 0x68, 0x05, 0x9B, 0x7F, 0x52, 0x0E, 0x51, 0x85, 0xAE, 0x67, 0xBB, 0x67, 0xE6, 0x09, 0x6A | |
I1: .byte 0x19, 0xCD, 0xE0, 0x5B, 0xAB, 0xD9, 0x83, 0x1F, 0x3A, 0xF5, 0x4F, 0xA5, 0x72, 0xF3, 0x6E, 0x3C | |
C0: .byte 0x98, 0x2F, 0x8A, 0x42, 0x91, 0x44, 0x37, 0x71, 0xCF, 0xFB, 0xC0, 0xB5, 0xA5, 0xDB, 0xB5, 0xE9 | |
C1: .byte 0x5B, 0xC2, 0x56, 0x39, 0xF1, 0x11, 0xF1, 0x59, 0xA4, 0x82, 0x3F, 0x92, 0xD5, 0x5E, 0x1C, 0xAB | |
C2: .byte 0x98, 0xAA, 0x07, 0xD8, 0x01, 0x5B, 0x83, 0x12, 0xBE, 0x85, 0x31, 0x24, 0xC3, 0x7D, 0x0C, 0x55 | |
C3: .byte 0x74, 0x5D, 0xBE, 0x72, 0xFE, 0xB1, 0xDE, 0x80, 0xA7, 0x06, 0xDC, 0x9B, 0x74, 0xF1, 0x9B, 0xC1 | |
C4: .byte 0xC1, 0x69, 0x9B, 0xE4, 0x86, 0x47, 0xBE, 0xEF, 0xC6, 0x9D, 0xC1, 0x0F, 0xCC, 0xA1, 0x0C, 0x24 | |
C5: .byte 0x6F, 0x2C, 0xE9, 0x2D, 0xAA, 0x84, 0x74, 0x4A, 0xDC, 0xA9, 0xB0, 0x5C, 0xDA, 0x88, 0xF9, 0x76 | |
C6: .byte 0x52, 0x51, 0x3E, 0x98, 0x6D, 0xC6, 0x31, 0xA8, 0xC8, 0x27, 0x03, 0xB0, 0xC7, 0x7F, 0x59, 0xBF | |
C7: .byte 0xF3, 0x0B, 0xE0, 0xC6, 0x47, 0x91, 0xA7, 0xD5, 0x51, 0x63, 0xCA, 0x06, 0x67, 0x29, 0x29, 0x14 | |
C8: .byte 0x85, 0x0A, 0xB7, 0x27, 0x38, 0x21, 0x1B, 0x2E, 0xFC, 0x6D, 0x2C, 0x4D, 0x13, 0x0D, 0x38, 0x53 | |
C9: .byte 0x54, 0x73, 0x0A, 0x65, 0xBB, 0x0A, 0x6A, 0x76, 0x2E, 0xC9, 0xC2, 0x81, 0x85, 0x2C, 0x72, 0x92 | |
C10: .byte 0xA1, 0xE8, 0xBF, 0xA2, 0x4B, 0x66, 0x1A, 0xA8, 0x70, 0x8B, 0x4B, 0xC2, 0xA3, 0x51, 0x6C, 0xC7 | |
C11: .byte 0x19, 0xE8, 0x92, 0xD1, 0x24, 0x06, 0x99, 0xD6, 0x85, 0x35, 0x0E, 0xF4, 0x70, 0xA0, 0x6A, 0x10 | |
C12: .byte 0x16, 0xC1, 0xA4, 0x19, 0x08, 0x6C, 0x37, 0x1E, 0x4C, 0x77, 0x48, 0x27, 0xB5, 0xBC, 0xB0, 0x34 | |
C13: .byte 0xB3, 0x0C, 0x1C, 0x39, 0x4A, 0xAA, 0xD8, 0x4E, 0x4F, 0xCA, 0x9C, 0x5B, 0xF3, 0x6F, 0x2E, 0x68 | |
C14: .byte 0xEE, 0x82, 0x8F, 0x74, 0x6F, 0x63, 0xA5, 0x78, 0x14, 0x78, 0xC8, 0x84, 0x08, 0x02, 0xC7, 0x8C | |
C15: .byte 0xFA, 0xFF, 0xBE, 0x90, 0xEB, 0x6C, 0x50, 0xA4, 0xF7, 0xA3, 0xF9, 0xBE, 0xF2, 0x78, 0x71, 0xC6 | |
S: .byte 0x19, 0xCD, 0xE0, 0x5B, 0xAB, 0xD9, 0x83, 0x1F, 0x3A, 0xF5, 0x4F, 0xA5, 0x72, 0xf3, 0x6E, 0x3C | |
.text | |
# ---- volatile ---- | |
#define DAT0 xmm3 | |
# ---- non-volatile ---- | |
#define STATE0 xmm10 | |
#define STATE1 xmm11 | |
#define MSG0 xmm12 | |
#define MSG1 xmm13 | |
#define MSG2 xmm14 | |
#define MSG3 xmm15 | |
_func(sha256_12) # uint64_t data => uint64_t hash | |
push rbp | |
mov rbp, rsp | |
# store xmm10-15 x6 | |
lea rsp, [rsp-16*6] | |
_xmm2sp(0, 10) | |
_xmm2sp(1, 11) | |
_xmm2sp(2, 12) | |
_xmm2sp(3, 13) | |
_xmm2sp(4, 14) | |
_xmm2sp(5, 15) | |
# -------------------------------- | |
# | prepare data | |
# DAT0 | |
movq DAT0, rp1 | |
pinsrq DAT0, _qdat(CD), 1 | |
# -------------------------------- | |
# | actual sha256_12 hashing | |
# init state, pre shuffled | |
movdqa STATE0, _xmmd(I0) | |
movdqa STATE1, _xmmd(I1) | |
# rounds 0-3 | |
movdqa MSG0, DAT0 | |
_shasr1 (STATE0, STATE1, rax, xmm0, MSG0, C0) | |
# rounds 4-7 | |
movdqa MSG1, _xmmd(D12) | |
_shasr2 (STATE0, STATE1, rax, xmm0, MSG1, MSG0, C1) | |
# rounds 8-11 | |
movdqa MSG2, _xmmd(D12) | |
_shasr2 (STATE0, STATE1, rax, xmm0, MSG2, MSG1, C2) | |
# rounds 12-15 | |
movdqa MSG3, _xmmd(D3) | |
_shasr3 (STATE0, STATE1, rax, xmm0, xmm1, MSG3, MSG2, MSG0, C3) | |
# rounds 16-19 | |
_shasr3 (STATE0, STATE1, rax, xmm0, xmm1, MSG0, MSG3, MSG1, C4) | |
# rounds 20-23 | |
_shasr3 (STATE0, STATE1, rax, xmm0, xmm1, MSG1, MSG0, MSG2, C5) | |
# rounds 24-27 | |
_shasr3 (STATE0, STATE1, rax, xmm0, xmm1, MSG2, MSG1, MSG3, C6) | |
# rounds 28-31 | |
_shasr3 (STATE0, STATE1, rax, xmm0, xmm1, MSG3, MSG2, MSG0, C7) | |
# rounds 32-35 | |
_shasr3 (STATE0, STATE1, rax, xmm0, xmm1, MSG0, MSG3, MSG1, C8) | |
# rounds 36-39 | |
_shasr3 (STATE0, STATE1, rax, xmm0, xmm1, MSG1, MSG0, MSG2, C9) | |
# rounds 40-43 | |
_shasr3 (STATE0, STATE1, rax, xmm0, xmm1, MSG2, MSG1, MSG3, C10) | |
# rounds 44-47 | |
_shasr3 (STATE0, STATE1, rax, xmm0, xmm1, MSG3, MSG2, MSG0, C11) | |
# rounds 48-51 | |
_shasr3 (STATE0, STATE1, rax, xmm0, xmm1, MSG0, MSG3, MSG1, C12) | |
# rounds 52-55 | |
_shasr3 (STATE0, STATE1, rax, xmm0, xmm1, MSG1, MSG0, MSG2, C13) | |
# rounds 56-59 | |
_shasr3 (STATE0, STATE1, rax, xmm0, xmm1, MSG2, MSG1, MSG3, C14) | |
# rounds 60-63 | |
_shasr1 (STATE0, STATE1, rax, xmm0, MSG3, C15) | |
# combine state | |
paddd STATE1, _xmmd(S) | |
pshufd xmm0, STATE0, 0x1B | |
pshufd STATE1, STATE1, 0xB1 | |
movdqa STATE0, xmm0 | |
pblendw STATE0, STATE1, 0xF0 | |
palignr STATE1, xmm0, 8 | |
# result hash in high 64bit of STATE1 (swapped double dword) | |
pextrq rax, STATE1, 1 | |
_sp2xmm(10, 0) | |
_sp2xmm(11, 1) | |
_sp2xmm(12, 2) | |
_sp2xmm(13, 3) | |
_sp2xmm(14, 4) | |
_sp2xmm(15, 5) | |
mov rsp, rbp | |
pop rbp | |
ret |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment