Created
April 3, 2021 16:35
-
-
Save tarcieri/414a3300072160f372b5d93ccfce280b to your computer and use it in GitHub Desktop.
`core::arch`-based SHA-256 implementation using the ARMv8 Cryptography Extensions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#![feature(stdsimd)]

// Based on the following C intrinsics implementation:
// <https://github.com/noloader/SHA-Intrinsics/blob/master/sha256-arm.c>
//
// Original C written and placed in public domain by Jeffrey Walton.
// Based on code from ARM, and by Johannes Schneiders, Skip Hovsmith and
// Barry O'Rourke for the mbedTLS project.

use core::arch::aarch64::*;
/// SHA-256 round constants: the first 32 bits of the fractional parts of
/// the cube roots of the first 64 prime numbers (FIPS 180-4, section 4.2.2).
const K: [u32; 64] = [
    0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
    0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
    0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
    0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
    0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
    0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
    0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
    0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2,
];
pub fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) { | |
// Load state | |
let mut state0 = vld1q_u32(&state[0]); | |
let mut state1 = vld1q_u32(&state[4]); | |
for block in blocks { | |
// Save state | |
let abef_save = state0; | |
let cdgh_save = state1; | |
// Load message | |
let mut msg = [ | |
vld1q_u32(block[..16].as_ptr() as *const u32), | |
vld1q_u32(block[16..32].as_ptr() as *const u32), | |
vld1q_u32(block[32..48].as_ptr() as *const u32), | |
vld1q_u32(block[48..].as_ptr() as *const u32), | |
]; | |
// Reverse for little endian | |
for i in 0..4 { | |
// TODO(tarcieri): figure out why `vreinterpretq_u8_u32` is missing on aarch64 | |
// msg[i] = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(msg[i]))); | |
let m = unsafe { *(&msg[i] as *const uint32x4_t as *const uint8x16_t) }; | |
msg[i] = vreinterpretq_u32_u8(vrev32q_u8(m)); | |
} | |
let mut tmp0 = vaddq_u32(msg[0], vld1q_u32(&K[0x00])); | |
let mut tmp1: uint32x4_t; | |
let mut tmp2: uint32x4_t; | |
// Rounds 0-3 | |
msg[0] = vsha256su0q_u32(msg[0], msg[1]); | |
tmp2 = state0; | |
tmp1 = vaddq_u32(msg[1], vld1q_u32(&K[0x04])); | |
state0 = vsha256hq_u32(state0, state1, tmp0); | |
state1 = vsha256h2q_u32(state1, tmp2, tmp0); | |
msg[0] = vsha256su1q_u32(msg[0], msg[2], msg[3]); | |
// Rounds 4-7 | |
msg[1] = vsha256su0q_u32(msg[1], msg[2]); | |
tmp2 = state0; | |
tmp0 = vaddq_u32(msg[2], vld1q_u32(&K[0x08])); | |
state0 = vsha256hq_u32(state0, state1, tmp1); | |
state1 = vsha256h2q_u32(state1, tmp2, tmp1); | |
msg[1] = vsha256su1q_u32(msg[1], msg[3], msg[0]); | |
// Rounds 8-11 | |
msg[2] = vsha256su0q_u32(msg[2], msg[3]); | |
tmp2 = state0; | |
tmp1 = vaddq_u32(msg[3], vld1q_u32(&K[0x0c])); | |
state0 = vsha256hq_u32(state0, state1, tmp0); | |
state1 = vsha256h2q_u32(state1, tmp2, tmp0); | |
msg[2] = vsha256su1q_u32(msg[2], msg[0], msg[1]); | |
// Rounds 12-15 | |
msg[3] = vsha256su0q_u32(msg[3], msg[0]); | |
tmp2 = state0; | |
tmp0 = vaddq_u32(msg[0], vld1q_u32(&K[0x10])); | |
state0 = vsha256hq_u32(state0, state1, tmp1); | |
state1 = vsha256h2q_u32(state1, tmp2, tmp1); | |
msg[3] = vsha256su1q_u32(msg[3], msg[1], msg[2]); | |
// Rounds 16-19 | |
msg[0] = vsha256su0q_u32(msg[0], msg[1]); | |
tmp2 = state0; | |
tmp1 = vaddq_u32(msg[1], vld1q_u32(&K[0x14])); | |
state0 = vsha256hq_u32(state0, state1, tmp0); | |
state1 = vsha256h2q_u32(state1, tmp2, tmp0); | |
msg[0] = vsha256su1q_u32(msg[0], msg[2], msg[3]); | |
// Rounds 20-23 | |
msg[1] = vsha256su0q_u32(msg[1], msg[2]); | |
tmp2 = state0; | |
tmp0 = vaddq_u32(msg[2], vld1q_u32(&K[0x18])); | |
state0 = vsha256hq_u32(state0, state1, tmp1); | |
state1 = vsha256h2q_u32(state1, tmp2, tmp1); | |
msg[1] = vsha256su1q_u32(msg[1], msg[3], msg[0]); | |
// Rounds 24-27 | |
msg[2] = vsha256su0q_u32(msg[2], msg[3]); | |
tmp2 = state0; | |
tmp1 = vaddq_u32(msg[3], vld1q_u32(&K[0x1c])); | |
state0 = vsha256hq_u32(state0, state1, tmp0); | |
state1 = vsha256h2q_u32(state1, tmp2, tmp0); | |
msg[2] = vsha256su1q_u32(msg[2], msg[0], msg[1]); | |
// Rounds 28-31 | |
msg[3] = vsha256su0q_u32(msg[3], msg[0]); | |
tmp2 = state0; | |
tmp0 = vaddq_u32(msg[0], vld1q_u32(&K[0x20])); | |
state0 = vsha256hq_u32(state0, state1, tmp1); | |
state1 = vsha256h2q_u32(state1, tmp2, tmp1); | |
msg[3] = vsha256su1q_u32(msg[3], msg[1], msg[2]); | |
// Rounds 32-35 | |
msg[0] = vsha256su0q_u32(msg[0], msg[1]); | |
tmp2 = state0; | |
tmp1 = vaddq_u32(msg[1], vld1q_u32(&K[0x24])); | |
state0 = vsha256hq_u32(state0, state1, tmp0); | |
state1 = vsha256h2q_u32(state1, tmp2, tmp0); | |
msg[0] = vsha256su1q_u32(msg[0], msg[2], msg[3]); | |
// Rounds 36-39 | |
msg[1] = vsha256su0q_u32(msg[1], msg[2]); | |
tmp2 = state0; | |
tmp0 = vaddq_u32(msg[2], vld1q_u32(&K[0x28])); | |
state0 = vsha256hq_u32(state0, state1, tmp1); | |
state1 = vsha256h2q_u32(state1, tmp2, tmp1); | |
msg[1] = vsha256su1q_u32(msg[1], msg[3], msg[0]); | |
// Rounds 40-43 | |
msg[2] = vsha256su0q_u32(msg[2], msg[3]); | |
tmp2 = state0; | |
tmp1 = vaddq_u32(msg[3], vld1q_u32(&K[0x2c])); | |
state0 = vsha256hq_u32(state0, state1, tmp0); | |
state1 = vsha256h2q_u32(state1, tmp2, tmp0); | |
msg[2] = vsha256su1q_u32(msg[2], msg[0], msg[1]); | |
// Rounds 44-47 | |
msg[3] = vsha256su0q_u32(msg[3], msg[0]); | |
tmp2 = state0; | |
tmp0 = vaddq_u32(msg[0], vld1q_u32(&K[0x30])); | |
state0 = vsha256hq_u32(state0, state1, tmp1); | |
state1 = vsha256h2q_u32(state1, tmp2, tmp1); | |
msg[3] = vsha256su1q_u32(msg[3], msg[1], msg[2]); | |
// Rounds 48-51 | |
tmp2 = state0; | |
tmp1 = vaddq_u32(msg[1], vld1q_u32(&K[0x34])); | |
state0 = vsha256hq_u32(state0, state1, tmp0); | |
state1 = vsha256h2q_u32(state1, tmp2, tmp0); | |
// Rounds 52-55 | |
tmp2 = state0; | |
tmp0 = vaddq_u32(msg[2], vld1q_u32(&K[0x38])); | |
state0 = vsha256hq_u32(state0, state1, tmp1); | |
state1 = vsha256h2q_u32(state1, tmp2, tmp1); | |
// Rounds 56-59 | |
tmp2 = state0; | |
tmp1 = vaddq_u32(msg[3], vld1q_u32(&K[0x3c])); | |
state0 = vsha256hq_u32(state0, state1, tmp0); | |
state1 = vsha256h2q_u32(state1, tmp2, tmp0); | |
// Rounds 60-63 | |
tmp2 = state0; | |
state0 = vsha256hq_u32(state0, state1, tmp1); | |
state1 = vsha256h2q_u32(state1, tmp2, tmp1); | |
// Combine state | |
state0 = vaddq_u32(state0, abef_save); | |
state1 = vaddq_u32(state1, cdgh_save); | |
} | |
// Save state | |
vst1q_u32(&state[0], state0); | |
vst1q_u32(&state[4], state1); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment