Created
November 23, 2019 02:18
-
-
Save sir-wabbit/f55c301ccec3b2f7fa85daa8781a2de7 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#if __ENDIAN_LITTLE__ | |
# define NATIVE_LITTLE_ENDIAN 1 | |
#endif | |
#if __LITTLE_ENDIAN__ | |
# define NATIVE_LITTLE_ENDIAN 1 | |
#endif | |
#define BLAKE2S_BLOCKBYTES 64 | |
typedef unsigned long uint64_t; | |
typedef signed long int64_t; | |
typedef unsigned int uint32_t; | |
typedef signed int int32_t; | |
typedef unsigned short uint16_t; | |
typedef signed short int16_t; | |
typedef unsigned char uint8_t; | |
typedef signed char int8_t; | |
inline uint32_t rotr32( const uint32_t w, const unsigned c ) { | |
return ( w >> c ) | ( w << ( 32 - c ) ); | |
} | |
#define SWAP32(x) (( uint32_t )( x & 0xFF000000 ) >> 24) | \ | |
(( uint32_t )( x & 0x00FF0000 ) >> 8) | \ | |
(( uint32_t )( x & 0x0000FF00 ) << 8) | \ | |
(( uint32_t )( x & 0x000000FF ) << 24) | |
#if defined(NATIVE_LITTLE_ENDIAN) | |
# define LOAD32(dst, src) do { dst = (*(const global uint32_t*)(src)); } while(0) | |
# define STORE32(dst, src) do { (*(global uint32_t*)(dst)) = src; } while(0) | |
#else | |
# define LOAD32(dst, src) do { \ | |
dst = (*(const global uint32_t*)src); \ | |
dst = SWAP32(dst); \ | |
} while(0) | |
# define STORE32(dst, src) do { \ | |
uint32_t tmp = src; \ | |
tmp = SWAP32(tmp); \ | |
(*(global uint32_t*)(dst)) = tmp; \ | |
} while(0) | |
#endif | |
#define IV0 0x6A09E667UL | |
#define IV1 0xBB67AE85UL | |
#define IV2 0x3C6EF372UL | |
#define IV3 0xA54FF53AUL | |
#define IV4 0x510E527FUL | |
#define IV5 0x9B05688CUL | |
#define IV6 0x1F83D9ABUL | |
#define IV7 0x5BE0CD19UL | |
#define IV(r) IV ## r | |
#define Z00 0 | |
#define Z01 1 | |
#define Z02 2 | |
#define Z03 3 | |
#define Z04 4 | |
#define Z05 5 | |
#define Z06 6 | |
#define Z07 7 | |
#define Z08 8 | |
#define Z09 9 | |
#define Z0A A | |
#define Z0B B | |
#define Z0C C | |
#define Z0D D | |
#define Z0E E | |
#define Z0F F | |
#define Z10 E | |
#define Z11 A | |
#define Z12 4 | |
#define Z13 8 | |
#define Z14 9 | |
#define Z15 F | |
#define Z16 D | |
#define Z17 6 | |
#define Z18 1 | |
#define Z19 C | |
#define Z1A 0 | |
#define Z1B 2 | |
#define Z1C B | |
#define Z1D 7 | |
#define Z1E 5 | |
#define Z1F 3 | |
#define Z20 B | |
#define Z21 8 | |
#define Z22 C | |
#define Z23 0 | |
#define Z24 5 | |
#define Z25 2 | |
#define Z26 F | |
#define Z27 D | |
#define Z28 A | |
#define Z29 E | |
#define Z2A 3 | |
#define Z2B 6 | |
#define Z2C 7 | |
#define Z2D 1 | |
#define Z2E 9 | |
#define Z2F 4 | |
#define Z30 7 | |
#define Z31 9 | |
#define Z32 3 | |
#define Z33 1 | |
#define Z34 D | |
#define Z35 C | |
#define Z36 B | |
#define Z37 E | |
#define Z38 2 | |
#define Z39 6 | |
#define Z3A 5 | |
#define Z3B A | |
#define Z3C 4 | |
#define Z3D 0 | |
#define Z3E F | |
#define Z3F 8 | |
#define Z40 9 | |
#define Z41 0 | |
#define Z42 5 | |
#define Z43 7 | |
#define Z44 2 | |
#define Z45 4 | |
#define Z46 A | |
#define Z47 F | |
#define Z48 E | |
#define Z49 1 | |
#define Z4A B | |
#define Z4B C | |
#define Z4C 6 | |
#define Z4D 8 | |
#define Z4E 3 | |
#define Z4F D | |
#define Z50 2 | |
#define Z51 C | |
#define Z52 6 | |
#define Z53 A | |
#define Z54 0 | |
#define Z55 B | |
#define Z56 8 | |
#define Z57 3 | |
#define Z58 4 | |
#define Z59 D | |
#define Z5A 7 | |
#define Z5B 5 | |
#define Z5C F | |
#define Z5D E | |
#define Z5E 1 | |
#define Z5F 9 | |
#define Z60 C | |
#define Z61 5 | |
#define Z62 1 | |
#define Z63 F | |
#define Z64 E | |
#define Z65 D | |
#define Z66 4 | |
#define Z67 A | |
#define Z68 0 | |
#define Z69 7 | |
#define Z6A 6 | |
#define Z6B 3 | |
#define Z6C 9 | |
#define Z6D 2 | |
#define Z6E 8 | |
#define Z6F B | |
#define Z70 D | |
#define Z71 B | |
#define Z72 7 | |
#define Z73 E | |
#define Z74 C | |
#define Z75 1 | |
#define Z76 3 | |
#define Z77 9 | |
#define Z78 5 | |
#define Z79 0 | |
#define Z7A F | |
#define Z7B 4 | |
#define Z7C 8 | |
#define Z7D 6 | |
#define Z7E 2 | |
#define Z7F A | |
#define Z80 6 | |
#define Z81 F | |
#define Z82 E | |
#define Z83 9 | |
#define Z84 B | |
#define Z85 3 | |
#define Z86 0 | |
#define Z87 8 | |
#define Z88 C | |
#define Z89 2 | |
#define Z8A D | |
#define Z8B 7 | |
#define Z8C 1 | |
#define Z8D 4 | |
#define Z8E A | |
#define Z8F 5 | |
#define Z90 A | |
#define Z91 2 | |
#define Z92 8 | |
#define Z93 4 | |
#define Z94 7 | |
#define Z95 6 | |
#define Z96 1 | |
#define Z97 5 | |
#define Z98 F | |
#define Z99 B | |
#define Z9A 9 | |
#define Z9B E | |
#define Z9C 3 | |
#define Z9D C | |
#define Z9E D | |
#define Z9F 0 | |
#define Mx(r, i) Mx_(Z ## r ## i) | |
#define Mx_(n) Mx__(n) | |
#define Mx__(n) M ## n | |
#define G(m0, m1, a,b,c,d) \ | |
do { \ | |
a = a + b + (m0); \ | |
d = rotr32(d ^ a, 16); \ | |
c = c + d; \ | |
b = rotr32(b ^ c, 12); \ | |
a = a + b + (m1); \ | |
d = rotr32(d ^ a, 8); \ | |
c = c + d; \ | |
b = rotr32(b ^ c, 7); \ | |
} while(0) | |
#define ROUND(r) do { \ | |
G(Mx(r, 0), Mx(r, 1), V0, V4, V8, VC); \ | |
G(Mx(r, 2), Mx(r, 3), V1, V5, V9, VD); \ | |
G(Mx(r, 4), Mx(r, 5), V2, V6, VA, VE); \ | |
G(Mx(r, 6), Mx(r, 7), V3, V7, VB, VF); \ | |
G(Mx(r, 8), Mx(r, 9), V0, V5, VA, VF); \ | |
G(Mx(r, A), Mx(r, B), V1, V6, VB, VC); \ | |
G(Mx(r, C), Mx(r, D), V2, V7, V8, VD); \ | |
G(Mx(r, E), Mx(r, F), V3, V4, V9, VE); \ | |
} while(0) | |
#define DO_COMPRESS(m0, m1, block) do { \ | |
LOAD32(M0, m0); \ | |
LOAD32(M1, m1); \ | |
LOAD32(M2, block + 8-8); \ | |
LOAD32(M3, block + 12-8); \ | |
LOAD32(M4, block + 16-8); \ | |
LOAD32(M5, block + 20-8); \ | |
LOAD32(M6, block + 24-8); \ | |
LOAD32(M7, block + 28-8); \ | |
LOAD32(M8, block + 32-8); \ | |
LOAD32(M9, block + 36-8); \ | |
LOAD32(MA, block + 40-8); \ | |
LOAD32(MB, block + 44-8); \ | |
LOAD32(MC, block + 48-8); \ | |
LOAD32(MD, block + 52-8); \ | |
LOAD32(ME, block + 56-8); \ | |
LOAD32(MF, block + 60-8); \ | |
V0 = H0; V1 = H1; V2 = H2; V3 = H3; \ | |
V4 = H4; V5 = H5; V6 = H6; V7 = H7; \ | |
V8 = IV0; V9 = IV1; VA = IV2; VB = IV3; \ | |
VC = T0 ^ IV4; VD = T1 ^ IV5; VE = F0 ^ IV6; VF = F1 ^ IV7; \ | |
ROUND(0); \ | |
ROUND(1); \ | |
ROUND(2); \ | |
ROUND(3); \ | |
ROUND(4); \ | |
ROUND(5); \ | |
ROUND(6); \ | |
ROUND(7); \ | |
ROUND(8); \ | |
ROUND(9); \ | |
H0 = H0 ^ V0 ^ V8;\ | |
H1 = H1 ^ V1 ^ V9;\ | |
H2 = H2 ^ V2 ^ VA;\ | |
H3 = H3 ^ V3 ^ VB;\ | |
H4 = H4 ^ V4 ^ VC;\ | |
H5 = H5 ^ V5 ^ VD;\ | |
H6 = H6 ^ V6 ^ VE;\ | |
H7 = H7 ^ V7 ^ VF;\ | |
} while (0) | |
#define DO_COMPRESS_SIMPLE(buf) DO_COMPRESS(buf+0, buf+4, buf+8) | |
// static inline void blake2s_run320(uint64_t nonce, uint8_t buf[320-8], uint8_t hash[32]) { | |
// uint32_t H0, H1, H2, H3, H4, H5, H6, H7; | |
// H0 = 0x6b08e647UL; | |
// H1 = IV(1); | |
// H2 = IV(2); | |
// H3 = IV(3); | |
// H4 = IV(4); | |
// H5 = IV(5); | |
// H6 = IV(6); | |
// H7 = IV(7); | |
// uint32_t T0 = 0, T1 = 0, F0 = 0, F1 = 0; | |
// uint32_t M0, M1, M2, M3, M4, M5, M6, M7; | |
// uint32_t M8, M9, MA, MB, MC, MD, ME, MF; | |
// uint32_t V0, V1, V2, V3, V4, V5, V6, V7; | |
// uint32_t V8, V9, VA, VB, VC, VD, VE, VF; | |
// uint8_t * nonce_ptr = (uint8_t*)(&nonce); | |
// // blake2s_update(S, nonce_ptr, 8); | |
// // blake2s_update(S, buf, 64-8); | |
// T0 += BLAKE2S_BLOCKBYTES; | |
// T1 += ( T0 < BLAKE2S_BLOCKBYTES ); | |
// DO_COMPRESS(LOAD32(nonce_ptr), LOAD32(nonce_ptr+4), buf); | |
// // blake2s_update(S, buf-8+64, 64); | |
// T0 += BLAKE2S_BLOCKBYTES; | |
// T1 += ( T0 < BLAKE2S_BLOCKBYTES ); | |
// DO_COMPRESS_SIMPLE(buf-8+64); | |
// // blake2s_update(S, buf-8+128, 64); | |
// T0 += BLAKE2S_BLOCKBYTES; | |
// T1 += ( T0 < BLAKE2S_BLOCKBYTES ); | |
// DO_COMPRESS_SIMPLE(buf-8+128); | |
// // blake2s_update(S, buf-8+192, 64); | |
// T0 += BLAKE2S_BLOCKBYTES; | |
// T1 += ( T0 < BLAKE2S_BLOCKBYTES ); | |
// DO_COMPRESS_SIMPLE(buf-8+192); | |
// // blake2s_update(S, buf-8+256, 64); | |
// // assert(S->buflen == 0); | |
// // printf("%d", S->buflen); | |
// // blake2s_increment_counter(S, 64); | |
// F0 = -1; | |
// T0 += BLAKE2S_BLOCKBYTES; | |
// T1 += ( T0 < BLAKE2S_BLOCKBYTES ); | |
// DO_COMPRESS_SIMPLE(buf-8+256); | |
// // blake2s_final(S, hash, BLAKE2S_OUTBYTES); | |
// store32(hash + 0, H0); | |
// store32(hash + 4, H1); | |
// store32(hash + 8, H2); | |
// store32(hash + 12, H3); | |
// store32(hash + 16, H4); | |
// store32(hash + 20, H5); | |
// store32(hash + 24, H6); | |
// store32(hash + 28, H7); | |
// } | |
kernel void run320( | |
global uint64_t* nonce, | |
global uint8_t* buf, | |
global uint8_t* hash | |
) { | |
uint32_t H0, H1, H2, H3, H4, H5, H6, H7; | |
H0 = 0x6b08e647UL; | |
H1 = IV(1); | |
H2 = IV(2); | |
H3 = IV(3); | |
H4 = IV(4); | |
H5 = IV(5); | |
H6 = IV(6); | |
H7 = IV(7); | |
uint32_t T0 = 0, T1 = 0, F0 = 0, F1 = 0; | |
uint32_t M0, M1, M2, M3, M4, M5, M6, M7; | |
uint32_t M8, M9, MA, MB, MC, MD, ME, MF; | |
uint32_t V0, V1, V2, V3, V4, V5, V6, V7; | |
uint32_t V8, V9, VA, VB, VC, VD, VE, VF; | |
global uint8_t * nonce_ptr = (global uint8_t*)(nonce); | |
// blake2s_update(S, nonce_ptr, 8); | |
// blake2s_update(S, buf, 64-8); | |
T0 += BLAKE2S_BLOCKBYTES; | |
T1 += ( T0 < BLAKE2S_BLOCKBYTES ); | |
DO_COMPRESS(nonce_ptr, nonce_ptr+4, buf); | |
// blake2s_update(S, buf-8+64, 64); | |
T0 += BLAKE2S_BLOCKBYTES; | |
T1 += ( T0 < BLAKE2S_BLOCKBYTES ); | |
DO_COMPRESS_SIMPLE(buf-8+64); | |
// blake2s_update(S, buf-8+128, 64); | |
T0 += BLAKE2S_BLOCKBYTES; | |
T1 += ( T0 < BLAKE2S_BLOCKBYTES ); | |
DO_COMPRESS_SIMPLE(buf-8+128); | |
// blake2s_update(S, buf-8+192, 64); | |
T0 += BLAKE2S_BLOCKBYTES; | |
T1 += ( T0 < BLAKE2S_BLOCKBYTES ); | |
DO_COMPRESS_SIMPLE(buf-8+192); | |
// blake2s_update(S, buf-8+256, 64); | |
// assert(S->buflen == 0); | |
// printf("%d", S->buflen); | |
// blake2s_increment_counter(S, 64); | |
F0 = -1; | |
T0 += BLAKE2S_BLOCKBYTES; | |
T1 += ( T0 < BLAKE2S_BLOCKBYTES ); | |
DO_COMPRESS_SIMPLE(buf-8+256); | |
// blake2s_final(S, hash, BLAKE2S_OUTBYTES); | |
STORE32(hash + 0, H0); | |
STORE32(hash + 4, H1); | |
STORE32(hash + 8, H2); | |
STORE32(hash + 12, H3); | |
STORE32(hash + 16, H4); | |
STORE32(hash + 20, H5); | |
STORE32(hash + 24, H6); | |
STORE32(hash + 28, H7); | |
} | |
kernel void search_nonce( | |
global uint64_t* start_nonce, | |
uint64_t work_set, | |
global uint8_t* buf, | |
global uint32_t* target_hash, | |
global uint8_t* result_ptr | |
) { | |
size_t gid = get_global_id(0); | |
uint64_t nonce0 = *start_nonce + gid * work_set; | |
// printf("gid=%d\n", gid); | |
// printf("start_nonce=%lld\n", *start_nonce); | |
// printf("work_set=%lld\n", work_set); | |
// printf("buf={%d, %d, %d, %d, ...}\n", buf[0], buf[1], buf[2], buf[3]); | |
// printf("target_hash={%d, %d, %d, %d, ...}\n", target_hash[0], target_hash[1], target_hash[2], target_hash[3]); | |
uint32_t target[8]; | |
for (int i = 0; i < 8; i++) target[i] = target_hash[i]; | |
for (uint64_t i = 0; i < work_set; i++) { | |
uint64_t nonce[1] = { nonce0 + i }; | |
uint32_t H0, H1, H2, H3, H4, H5, H6, H7; | |
H0 = 0x6b08e647UL; | |
H1 = IV(1); | |
H2 = IV(2); | |
H3 = IV(3); | |
H4 = IV(4); | |
H5 = IV(5); | |
H6 = IV(6); | |
H7 = IV(7); | |
uint32_t T0 = 0, T1 = 0, F0 = 0, F1 = 0; | |
uint32_t M0, M1, M2, M3, M4, M5, M6, M7; | |
uint32_t M8, M9, MA, MB, MC, MD, ME, MF; | |
uint32_t V0, V1, V2, V3, V4, V5, V6, V7; | |
uint32_t V8, V9, VA, VB, VC, VD, VE, VF; | |
uint8_t * nonce_ptr = (uint8_t*)(nonce); | |
// blake2s_update(S, nonce_ptr, 8); | |
// blake2s_update(S, buf, 64-8); | |
T0 += BLAKE2S_BLOCKBYTES; | |
T1 += ( T0 < BLAKE2S_BLOCKBYTES ); | |
DO_COMPRESS(nonce_ptr, nonce_ptr+4, buf); | |
// blake2s_update(S, buf-8+64, 64); | |
T0 += BLAKE2S_BLOCKBYTES; | |
T1 += ( T0 < BLAKE2S_BLOCKBYTES ); | |
DO_COMPRESS_SIMPLE(buf-8+64); | |
// blake2s_update(S, buf-8+128, 64); | |
T0 += BLAKE2S_BLOCKBYTES; | |
T1 += ( T0 < BLAKE2S_BLOCKBYTES ); | |
DO_COMPRESS_SIMPLE(buf-8+128); | |
// blake2s_update(S, buf-8+192, 64); | |
T0 += BLAKE2S_BLOCKBYTES; | |
T1 += ( T0 < BLAKE2S_BLOCKBYTES ); | |
DO_COMPRESS_SIMPLE(buf-8+192); | |
// blake2s_update(S, buf-8+256, 64); | |
// assert(S->buflen == 0); | |
// printf("%d", S->buflen); | |
// blake2s_increment_counter(S, 64); | |
F0 = -1; | |
T0 += BLAKE2S_BLOCKBYTES; | |
T1 += ( T0 < BLAKE2S_BLOCKBYTES ); | |
DO_COMPRESS_SIMPLE(buf-8+256); | |
// blake2s_final(S, hash, BLAKE2S_OUTBYTES); | |
uint32_t hash[8] = { H0, H1, H2, H3, H4, H5, H6, H7 }; | |
// printf("%lld %llx %llx\n", nonce[0], ((uint64_t*) target)[3], ((uint64_t*) &hash)[3]); | |
uint8_t result = 0; | |
for (int i = 0; i < 4; i++) { | |
uint64_t h1 = ((uint64_t*) target)[3 - i]; | |
uint64_t h2 = ((uint64_t*) &hash)[3 - i]; | |
if (h1 > h2) { | |
result = 1; | |
break; | |
} else if (h1 == h2) continue; | |
else { | |
result = -1; | |
break; | |
} | |
} | |
result_ptr[gid * work_set + i] = result; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment