Skip to content

Instantly share code, notes, and snippets.

@jevinskie
Created December 12, 2024 15:46
Show Gist options
  • Save jevinskie/d604dcaa38a09f4d14d09c167149dbc8 to your computer and use it in GitHub Desktop.
Save jevinskie/d604dcaa38a09f4d14d09c167149dbc8 to your computer and use it in GitHub Desktop.
AArch64 Advanced SIMD SHA1 digest to ASCII hex string
; digest_to_hex: convert a 20-byte digest into 40 ASCII hex characters plus a
; terminating NUL byte.
;   x0 = pointer to the input bytes  (bytes 0..19 are read)
;   x1 = pointer to the output buffer (bytes 0..40 are written)
; Clobbers x8 and v0-v3.
digest_to_hex:
; --- First 16 input bytes: split every byte into its two nibbles ---
ldr q0, [x0]
ushr.16b v1, v0, #4
movi.16b v2, #15
and.16b v0, v0, v2
; v1 = high nibbles, v0 = low nibbles (each lane now holds 0..15)
; nibble_hex_lut = { '0', ... '9', 'a' ... 'f' }
adrp x8, nibble_hex_lut@PAGE
ldr q2, [x8, nibble_hex_lut@PAGEOFF]
; Table lookup turns each nibble value into its hex character.
tbl.16b v1, { v2 }, v1
tbl.16b v0, { v2 }, v0
; Interleave high/low characters so each input byte yields "HL" in order:
; v2 = characters for input bytes 0..7, v3 = characters for input bytes 8..15.
zip1.16b v2, v1, v0
zip2.16b v3, v1, v0
; Work on a copy of the output pointer; x1 is kept for the final NUL store.
mov x8, x1
; Store 32 characters and post-increment x8 to output offset 32.
st1.16b { v2, v3 }, [x8], #32
; --- Last 4 input bytes (offsets 16..19): arithmetic conversion, no table ---
ldr s0, [x0, #16]
ushr.8b v1, v0, #4
movi.8b v2, #15
and.8b v0, v0, v2
; The low 4 lanes of v1 (high nibbles) and v0 (low nibbles) interleave into 8 nibbles.
zip1.8b v0, v1, v0
; OR with '0' (0x30) is the same as adding it here because every lane is < 16.
movi.8b v1, #48 ; '0'
orr.8b v0, v0, v1
; Unsigned compare: lanes above '9' become 0xFF, all other lanes become 0x00.
movi.8b v1, #57 ; '9'
cmhi.8b v1, v0, v1
; 39 (0x27) is the distance from ':' ('9' + 1) to 'a'; it is added only where the mask is set.
movi.8b v2, #39
and.8b v1, v1, v2
add.8b v0, v1, v0
; Store the final 8 characters at output offset 32 (x8 was advanced by the st1 above).
str d0, [x8]
strb wzr, [x1, #40] ; NUL
ret
; Iterations: 100
; Instructions: 2600
; Total Cycles: 620
; Total uOps: 2800
;
; Dispatch Width: 6
; uOps Per Cycle: 4.52
; IPC: 4.19
; Block RThroughput: 6.0
// Renders the four bytes of `value` as eight lowercase ASCII hex characters and
// returns them packed in a uint64_t.
//
// Vector byte lane k (k = 0..3) of the input produces two output bytes:
// output byte 2k is the hex digit of that byte's high nibble and output byte
// 2k+1 the hex digit of its low nibble. "Output byte i" refers to the i-th byte
// of the returned integer's object representation (the order vst1_u8 writes).
// NOTE(review): the lane <-> byte-of-`value` correspondence assumes a
// little-endian target -- confirm before using elsewhere.
uint64_t u32_to_hex_ascii_u64(const uint32_t value) {
    // Per-lane constants used by the conversion.
    const uint8x8_t low_nibble_mask = vdup_n_u8(0x0F);
    const uint8x8_t ascii_zero = vdup_n_u8('0');
    const uint8x8_t ascii_nine = vdup_n_u8('9');
    const uint8x8_t alpha_gap = vdup_n_u8(0x27); // 'a' - ('9' + 1)

    // vdup_n_u32 copies the word into both 32-bit lanes, so the byte view is
    // [B0,B1,B2,B3,B0,B1,B2,B3]; only lanes 0..3 feed the result.
    const uint8x8_t src = vreinterpret_u8_u32(vdup_n_u32(value));

    // Split every byte into its two 4-bit halves.
    const uint8x8_t hi_nibbles = vshr_n_u8(src, 4);
    const uint8x8_t lo_nibbles = vand_u8(src, low_nibble_mask);

    // The first half of the zip is H0,L0,H1,L1,H2,L2,H3,L3: exactly the eight
    // nibbles of interest, already in output order.
    const uint8x8_t digits = vzip_u8(hi_nibbles, lo_nibbles).val[0];

    // Map 0..9 onto '0'..'9'; 10..15 temporarily land on ':'..'?'.
    const uint8x8_t shifted = vadd_u8(digits, ascii_zero);
    // Lanes that overshot '9' get an all-ones mask (unsigned compare).
    const uint8x8_t is_alpha = vcgt_u8(shifted, ascii_nine);
    // Push only those lanes forward so ':'..'?' become 'a'..'f'.
    const uint8x8_t correction = vand_u8(is_alpha, alpha_gap);
    const uint8x8_t hex_chars = vadd_u8(shifted, correction);

    // Write the eight characters into the bytes of the return value.
    uint64_t packed;
    vst1_u8(reinterpret_cast<uint8_t *>(&packed), hex_chars);
    return packed;
}
// Converts a digest to a NUL-terminated lowercase hex string.
//
// digest:  input bytes; digest[0..19] are read.
// hex_str: output buffer; 40 hex characters are written to hex_str[0..39] and
//          a '\0' to hex_str[SHA1_OUTPUT_SIZE * 2].
//          NOTE(review): SHA1_OUTPUT_SIZE is defined elsewhere -- presumably 20,
//          which puts the terminator at index 40; confirm.
// The two pointers are declared __restrict, so the buffers must not overlap.
void digest_to_hex(const uint8_t *__restrict digest,
                   char *__restrict hex_str) {
    const uint8x16_t mask_lo = vdupq_n_u8(0x0F); // Mask for low 4 bits

    // Load the first 16 bytes of the digest and split each into two nibbles.
    const uint8x16_t input = vld1q_u8(digest);
    const uint8x16_t hi = vshrq_n_u8(input, 4);     // Shift high nibbles down
    const uint8x16_t lo = vandq_u8(input, mask_lo); // Isolate low nibbles

    // Convert to ASCII hex characters: each nibble (0..15) indexes the table.
    constexpr uint8x16_t lut = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'};
    const uint8x16_t hex_hi = vqtbl1q_u8(lut, hi);
    const uint8x16_t hex_lo = vqtbl1q_u8(lut, lo);

    // Interleave so every input byte yields "high digit, low digit", then store
    // all 32 characters at once.
    // NOTE(review): to_from_cast is defined elsewhere; presumably a checked
    // pointer-cast helper (char * -> uint8_t *) -- confirm.
    const uint8x16x2_t hex_chars_interleaved = vzipq_u8(hex_hi, hex_lo);
    vst1q_u8_x2((to_from_cast<uint8_t *, char *>(hex_str)), hex_chars_interleaved);

    // Handle the remaining 4 bytes.
    // digest + 16 and hex_str + 32 point into byte-typed storage with no
    // alignment guarantee, so dereferencing them as uint32_t / uint64_t would be
    // undefined behaviour (strict aliasing, misaligned access). Copying through
    // memcpy is well-defined and keeps the exact same byte order.
    uint32_t remaining_bytes;
    __builtin_memcpy(&remaining_bytes, digest + 16, sizeof(remaining_bytes));
    const uint64_t hex_packed = u32_to_hex_ascii_u64(remaining_bytes);
    __builtin_memcpy(hex_str + 32, &hex_packed, sizeof(hex_packed));

    hex_str[SHA1_OUTPUT_SIZE * 2] = '\0';
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment