Created
October 1, 2018 01:42
-
-
Save Nexuapex/b837cfa19e6db51be0952d7f654e8a9d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Abseil absl::flat_hash_map | |
// Analysis of the critical path for perfectly predicted load hit. | |
// https://godbolt.org/z/P_ksaa | |
// | |
// x86_64 | |
// Instruction latencies have been entirely ignored. | |
# rdi = desired key | |
mov rax, qword ptr [rip + kHashSeed] # [a00] rax = per-process seed (with entropy) | |
add rax, rdi # [a01] rax = hash so far | |
movabs rcx, 0x9DDFEA08EB382D69 # rcx = CityHash magic number | |
mul rcx # [a02] 128-bit multiply rax * rcx... | |
xor rdx, rax # [a03] ...and mix to 64-bits (rdx = hash of key) | |
mov r10, qword ptr [rip + g_map] # [b00] r10 = pointer to metadata bytes | |
mov rax, rdx | |
shr rax, 7 # [a04] rax = hash bits 7-63 | |
mov rcx, r10 | |
shr rcx, 12 # [b01] rcx = page address (ASLR entropy) | |
xor rcx, rax # [a05] rcx = H1 (with added entropy) -- n.b. also b02 | |
mov rsi, qword ptr [rip + g_map+24] # [c00] rsi = power-of-two-minus-one capacity mask | |
and dl, 127 # [d04] dl = H2 (hash bits 0-6) -- n.b. branched from "a" dep chain | |
movzx eax, dl # eax = H2 | |
vmovd xmm0, eax # [d05] xmm0[byte 0] = H2 | |
vpxor xmm1, xmm1, xmm1 | |
vpshufb xmm0, xmm0, xmm1 # [d06] splat byte 0 across all lanes of xmm0 | |
mov r9, qword ptr [rip + g_map+8] # [e00] r9 = pointer to interleaved keys and values | |
xor r8d, r8d | |
and rcx, rsi # [a06] rcx = bucket index (or slot index?) -- n.b. also c01 | |
vpcmpeqb xmm1, xmm0, xmmword ptr [r10 + rcx] # [a07] xmm1 = packed byte compare w/ metadata bytes -- n.b. also d07 | |
vpmovmskb edx, xmm1 # [a08] edx = mask of byte compare results | |
test edx, edx # [a09] did I find any candidates? | |
jz .probe_next_bucket # no candidates, go to next bucket | |
bsf eax, edx # [a09] eax = relative index of first candidate | |
add rax, rcx # [a10] eax = absolute index of first candidate | |
and rax, rsi # [a11] handle wrap-around | |
shl rax, 4 # [a12] rax = byte offset to key (after mul by 16) | |
cmp qword ptr [r9 + rax], rdi # [a13] does the key match desired key? | |
jne .check_next_slot # no match, go to next candidate | |
mov rax, qword ptr [r9 + rax + 8] # [a13] rax = value | |
ret | |
00: LD load seed, metadata ptr, slots ptr | |
01: ALU seed + key | |
02: ALU tmp1 * hash_magic (128-bit) | |
03: ALU tmp2.hi ^ tmp2.lo | |
04: ALU hash >> 7 ALU hash & 127 | |
05: ALU tmp3 ^ ASLR entropy FLT move metadata byte to floating point unit | |
06: ALU tmp4 & capacity_mask FLT splat metadata byte across all lanes | |
07: FLT [load-op] packed byte compare | |
08: FLT extract/move mask back to gpr | |
09: ALU bitscan to find first candidate | |
10: ALU candidate += bucket index | |
11: ALU candidate &= capacity_mask | |
12: ALU candidate *= 16 | |
13: LD [load-op] compare key LD load value | |
14 instructions long. The most expensive is the MUL during hashing. | |
2 dependent loads, although the second (key + value) could possibly be prefetched. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment