Last active
January 4, 2021 20:47
-
-
Save michael1011/29bd44ccf06d981b470614b0657bbfbc to your computer and use it in GitHub Desktop.
SHA256 benchmark for x86 with Intel SHA extensions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* sha256-x86.c - Intel SHA extensions using C intrinsics */
/* Written and placed in the public domain by Jeffrey Walton */
/* Based on code from Intel, and by Sean Gulley for */
/* the miTLS project. */
/* gcc -pthread -msse4.1 -msha sha256-x86.c -O3 -o sha256 */
/* Include the GCC super header */
#if defined(__GNUC__) | |
# include <stdint.h> | |
#include <unistd.h> | |
# include <x86intrin.h> | |
#include <pthread.h> | |
#include <stdatomic.h> | |
#endif | |
/* Microsoft supports the Intel SHA intrinsics as of Visual Studio 2015 */
#if defined(_MSC_VER) | |
# include <immintrin.h> | |
# define WIN32_LEAN_AND_MEAN | |
# include <Windows.h> | |
typedef UINT32 uint32_t; | |
typedef UINT8 uint8_t; | |
#endif | |
/* Process multiple blocks. The caller is responsible for setting the initial */
/* state, and the caller is responsible for padding the final block. */
/*
 * Runs the SHA-256 block transform over `data` using the Intel SHA
 * extension intrinsics (_mm_sha256rnds2_epu32, _mm_sha256msg1_epu32,
 * _mm_sha256msg2_epu32) and updates `state` in place.
 *
 * state  - eight 32-bit chaining words; loaded on entry, stored on exit.
 * data   - input bytes, read 64 bytes per loop iteration with unaligned loads.
 * length - number of bytes in `data`. The loop runs while length >= 64, so a
 *          trailing remainder shorter than 64 bytes is not processed.
 *
 * No padding is applied here and nothing is returned.
 */
void sha256_process_x86(uint32_t state[8], const uint8_t data[], uint32_t length) | |
{ | |
__m128i STATE0, STATE1; | |
__m128i MSG, TMP; | |
__m128i MSG0, MSG1, MSG2, MSG3; | |
__m128i ABEF_SAVE, CDGH_SAVE; | |
/* Shuffle mask that reverses the byte order inside each 32-bit lane; */
/* applied to every 16-byte chunk loaded from `data`. */
const __m128i MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); | |
/* Load initial values */ | |
/* The lane letters in the comments below are written highest lane first. */
/* state[] is stored linearly (A..H); it is repacked into one ABEF and one */
/* CDGH register, the operand layout taken by _mm_sha256rnds2_epu32. */
TMP = _mm_loadu_si128((const __m128i*) &state[0]); | |
STATE1 = _mm_loadu_si128((const __m128i*) &state[4]); | |
TMP = _mm_shuffle_epi32(TMP, 0xB1); /* CDAB */ | |
STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); /* EFGH */ | |
STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); /* ABEF */ | |
STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); /* CDGH */ | |
/* One iteration per complete 64-byte block. */
while (length >= 64) | |
{ | |
/* Save current state */ | |
/* Kept so the block's result can be added back to it after round 63. */
ABEF_SAVE = STATE0; | |
CDGH_SAVE = STATE1; | |
/* Rounds 0-3 */ | |
/* Pattern used by every four-round group: */
/*  - MSG = four message/schedule words plus the four round constants */
/*    packed into the _mm_set_epi64x literal; */
/*  - first _mm_sha256rnds2_epu32 call does two rounds and updates STATE1; */
/*  - the 0x0E shuffle moves the upper two lanes of MSG into the lower two; */
/*  - second call does the next two rounds and updates STATE0. */
MSG = _mm_loadu_si128((const __m128i*) (data+0)); | |
MSG0 = _mm_shuffle_epi8(MSG, MASK); | |
MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL)); | |
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); | |
MSG = _mm_shuffle_epi32(MSG, 0x0E); | |
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); | |
/* Rounds 4-7 */ | |
MSG1 = _mm_loadu_si128((const __m128i*) (data+16)); | |
MSG1 = _mm_shuffle_epi8(MSG1, MASK); | |
MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL)); | |
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); | |
MSG = _mm_shuffle_epi32(MSG, 0x0E); | |
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); | |
/* First message-schedule step for MSG0; it is completed in rounds 12-15. */
MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); | |
/* Rounds 8-11 */ | |
MSG2 = _mm_loadu_si128((const __m128i*) (data+32)); | |
MSG2 = _mm_shuffle_epi8(MSG2, MASK); | |
MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL)); | |
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); | |
MSG = _mm_shuffle_epi32(MSG, 0x0E); | |
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); | |
MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); | |
/* Rounds 12-15 */ | |
MSG3 = _mm_loadu_si128((const __m128i*) (data+48)); | |
MSG3 = _mm_shuffle_epi8(MSG3, MASK); | |
MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL)); | |
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); | |
/* Finish the schedule words that rounds 16-19 consume: alignr + add, then */
/* _mm_sha256msg2_epu32. The same three-line pattern repeats, rotating */
/* through MSG0..MSG3, in every group up to rounds 56-59. */
TMP = _mm_alignr_epi8(MSG3, MSG2, 4); | |
MSG0 = _mm_add_epi32(MSG0, TMP); | |
MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); | |
MSG = _mm_shuffle_epi32(MSG, 0x0E); | |
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); | |
MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); | |
/* Rounds 16-19 */ | |
MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL)); | |
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); | |
TMP = _mm_alignr_epi8(MSG0, MSG3, 4); | |
MSG1 = _mm_add_epi32(MSG1, TMP); | |
MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); | |
MSG = _mm_shuffle_epi32(MSG, 0x0E); | |
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); | |
MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); | |
/* Rounds 20-23 */ | |
MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL)); | |
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); | |
TMP = _mm_alignr_epi8(MSG1, MSG0, 4); | |
MSG2 = _mm_add_epi32(MSG2, TMP); | |
MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); | |
MSG = _mm_shuffle_epi32(MSG, 0x0E); | |
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); | |
MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); | |
/* Rounds 24-27 */ | |
MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL)); | |
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); | |
TMP = _mm_alignr_epi8(MSG2, MSG1, 4); | |
MSG3 = _mm_add_epi32(MSG3, TMP); | |
MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); | |
MSG = _mm_shuffle_epi32(MSG, 0x0E); | |
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); | |
MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); | |
/* Rounds 28-31 */ | |
MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL)); | |
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); | |
TMP = _mm_alignr_epi8(MSG3, MSG2, 4); | |
MSG0 = _mm_add_epi32(MSG0, TMP); | |
MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); | |
MSG = _mm_shuffle_epi32(MSG, 0x0E); | |
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); | |
MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); | |
/* Rounds 32-35 */ | |
MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL)); | |
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); | |
TMP = _mm_alignr_epi8(MSG0, MSG3, 4); | |
MSG1 = _mm_add_epi32(MSG1, TMP); | |
MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); | |
MSG = _mm_shuffle_epi32(MSG, 0x0E); | |
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); | |
MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); | |
/* Rounds 36-39 */ | |
MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL)); | |
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); | |
TMP = _mm_alignr_epi8(MSG1, MSG0, 4); | |
MSG2 = _mm_add_epi32(MSG2, TMP); | |
MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); | |
MSG = _mm_shuffle_epi32(MSG, 0x0E); | |
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); | |
MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); | |
/* Rounds 40-43 */ | |
MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL)); | |
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); | |
TMP = _mm_alignr_epi8(MSG2, MSG1, 4); | |
MSG3 = _mm_add_epi32(MSG3, TMP); | |
MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); | |
MSG = _mm_shuffle_epi32(MSG, 0x0E); | |
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); | |
MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); | |
/* Rounds 44-47 */ | |
MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL)); | |
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); | |
TMP = _mm_alignr_epi8(MSG3, MSG2, 4); | |
MSG0 = _mm_add_epi32(MSG0, TMP); | |
MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); | |
MSG = _mm_shuffle_epi32(MSG, 0x0E); | |
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); | |
MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); | |
/* Rounds 48-51 */ | |
MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL)); | |
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); | |
TMP = _mm_alignr_epi8(MSG0, MSG3, 4); | |
MSG1 = _mm_add_epi32(MSG1, TMP); | |
MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); | |
MSG = _mm_shuffle_epi32(MSG, 0x0E); | |
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); | |
MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); | |
/* Rounds 52-55 */ | |
/* From this group on there is no further _mm_sha256msg1_epu32 call. */
MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL)); | |
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); | |
TMP = _mm_alignr_epi8(MSG1, MSG0, 4); | |
MSG2 = _mm_add_epi32(MSG2, TMP); | |
MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); | |
MSG = _mm_shuffle_epi32(MSG, 0x0E); | |
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); | |
/* Rounds 56-59 */ | |
MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL)); | |
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); | |
TMP = _mm_alignr_epi8(MSG2, MSG1, 4); | |
MSG3 = _mm_add_epi32(MSG3, TMP); | |
MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); | |
MSG = _mm_shuffle_epi32(MSG, 0x0E); | |
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); | |
/* Rounds 60-63 */ | |
/* Last group: only the rounds remain, no schedule update. */
MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL)); | |
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); | |
MSG = _mm_shuffle_epi32(MSG, 0x0E); | |
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); | |
/* Combine state */ | |
/* Add the state saved at the top of the iteration to the round output. */
STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE); | |
STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE); | |
/* Advance to the next 64-byte block. */
data += 64; | |
length -= 64; | |
} | |
/* Undo the ABEF/CDGH packing so state[] is written back in linear order. */
TMP = _mm_shuffle_epi32(STATE0, 0x1B); /* FEBA */ | |
STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); /* DCHG */ | |
STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); /* DCBA */ | |
STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); /* HGFE */ | |
/* Save state */ | |
_mm_storeu_si128((__m128i*) &state[0], STATE0); | |
_mm_storeu_si128((__m128i*) &state[4], STATE1); | |
} | |
#include <limits.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
/*
 * Benchmark worker thread: hashes the same 64-byte block in an endless loop
 * and prints this thread's throughput about every two seconds.
 *
 * thread_num_void - pointer to an int holding the number shown in the
 *                   output. It is read once on entry; the pointed-to storage
 *                   is not freed here.
 *
 * Never returns.
 */
void* sha256(void *thread_num_void) {
    const int thread_num = *((int *) thread_num_void);
    int counter = 0;
    /* Keep the timestamp as time_t; storing it in an int truncates it. */
    time_t start_time = time(NULL);

    /*
     * Input block: 0x80 followed by 63 bytes of 0x21. This is not a correctly
     * padded SHA-256 message, so the resulting state is not a meaningful
     * digest; the block only supplies work for the benchmark.
     */
    uint8_t message[64];
    memset(message, 0x21, sizeof(message));
    message[0] = 0x80;

    /* Starting state; it is carried over from one iteration to the next. */
    uint32_t state[8] = {
        0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
        0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
    };

    for (;;) {
        sha256_process_x86(state, message, sizeof(message));
        counter++;

        /* Reading the clock on every hash is expensive; do it every 10000. */
        if (counter % 10000 != 0) {
            continue;
        }

        const time_t now = time(NULL);
        if (now > start_time + 1) {
            /*
             * Divide by the measured interval instead of assuming exactly
             * two seconds, so a delayed report is not overstated.
             */
            const double elapsed = difftime(now, start_time);
            printf("Thread %d: MHash / Second: %d\n", thread_num, (int) (counter / 1e6 / elapsed));
            counter = 0;
            start_time = now;
        }
    }
}
int main(int argc, char* argv[]) | |
{ | |
int NUM_THREADS = 24; | |
printf("Using %d threads\n\n", NUM_THREADS); | |
pthread_t threads[NUM_THREADS]; | |
for (int ii; ii < NUM_THREADS; ii++) { | |
int *arg = malloc(sizeof(*arg)); | |
*arg = ii; | |
pthread_create(&threads[ii], NULL, sha256, arg); | |
} | |
for (;;) { | |
usleep(1000 * 1000); | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Oh damn. I forgot 😅
Fixed.
My god. Only checking that every 10k iterations gives it a ~2.5x performance increase.
As I said, I am a C noob and coming from the JavaScript world where performance is... not as big of a concern....
View this as a proof of concept benchmark 😉