Analysis of 256-bit multiplication implementations in the context of wasm

All assembly snippets are generated from intx. Compiled with clang 9 (trunk), -O3 -march=skylake.

The first one, mul256_x86_64.s, is the one evmone is using. It exploits the fact that x86-64 has a mul instruction performing a full 64x64 -> 128 multiplication. See the umul() procedure in intx. This instruction cannot be used from wasm, but I'm working on teaching LLVM to recognize the 64x64 -> 128 mul pattern.

Instruction count:

  • mul: 10
  • add: 14
  • shift: 0
  • mov: 31
  • all: 73
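
For context, the 64x64 -> 128 primitive this version relies on can be sketched in C++ when the compiler exposes a 128-bit integer type. This is only an illustration with made-up names (umul_full, uint128_parts); see umul() in intx for the real implementation.

#include <cstdint>

// Sketch of a full 64x64 -> 128 multiplication, assuming unsigned __int128
// is available (GCC/clang). On x86-64 this lowers to a single mul
// instruction producing the low and high 64-bit halves of the product.
struct uint128_parts
{
    uint64_t lo;
    uint64_t hi;
};

inline uint128_parts umul_full(uint64_t x, uint64_t y) noexcept
{
    const auto p = static_cast<unsigned __int128>(x) * y;
    return {static_cast<uint64_t>(p), static_cast<uint64_t>(p >> 64)};
}
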

The second one, mul256_x86_64_no_mul128.s, is the same procedure, except it uses the umul_generic() procedure to avoid the 64x64 -> 128 mul instruction.

Instruction count:

  • mul: 28
  • add: 38
  • shift: 30
  • mov: 90
  • all: 210
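
Without a 128-bit type (the wasm case), each 64x64 -> 128 product has to be assembled from four 32x32 -> 64 products plus shifts and adds, which is where the roughly 3x growth in instruction count above comes from. A minimal sketch of that approach, again with made-up names (umul_generic() in intx may differ in details):

#include <cstdint>

// Sketch of a portable 64x64 -> 128 multiplication built from 32-bit halves.
// Each operand is split into two 32-bit limbs and the four partial products
// are recombined with shifts and adds.
struct uint128_parts
{
    uint64_t lo;
    uint64_t hi;
};

inline uint128_parts umul_full_generic(uint64_t x, uint64_t y) noexcept
{
    const uint64_t xl = x & 0xffffffff;
    const uint64_t xh = x >> 32;
    const uint64_t yl = y & 0xffffffff;
    const uint64_t yh = y >> 32;

    const uint64_t ll = xl * yl;
    const uint64_t hl = xh * yl;
    const uint64_t lh = xl * yh;
    const uint64_t hh = xh * yh;

    // Carry out of the middle 32-bit column.
    const uint64_t mid = (ll >> 32) + (hl & 0xffffffff) + (lh & 0xffffffff);

    const uint64_t lo = (ll & 0xffffffff) | (mid << 32);
    const uint64_t hi = hh + (hl >> 32) + (lh >> 32) + (mid >> 32);
    return {lo, hi};
}
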
mul256_x86_64.s:

intx::mul256(intx::uint<256u> const&, intx::uint<256u> const&): # @intx::mul256(intx::uint<256u> const&, intx::uint<256u> const&)
push rbp
push r15
push r14
push r13
push r12
push rbx
mov r11, rdx
mov r15, rdi
mov r10, qword ptr [rsi]
mov r12, qword ptr [rsi + 8]
mov rcx, qword ptr [rdx]
mov r9, qword ptr [rdx + 8]
mov rax, rcx
mul r10
mov rdi, rdx
mov qword ptr [rsp - 8], rax # 8-byte Spill
mov rax, r12
mul rcx
mov rbp, rdx
mov r13, rax
mov rax, r9
mul r10
mov rbx, rdx
mov r8, rax
mov rax, r9
mul r12
mov qword ptr [rsp - 24], rdx # 8-byte Spill
lea r14, [rdi + r13]
add r14, r8
adc rbx, rax
add rdi, r13
adc rbp, rbx
mov qword ptr [rsp - 16], rbp # 8-byte Spill
setb byte ptr [rsp - 25] # 1-byte Folded Spill
xor r8d, r8d
cmp rbx, rax
setb r8b
mov rdi, qword ptr [r11 + 16]
mov rax, rdi
mul r10
mov r13, rax
mov rbp, rdx
imul r10, qword ptr [r11 + 24]
imul rdi, r12
add rdi, r10
mov rbx, qword ptr [rsi + 16]
mov rax, rbx
imul rbx, r9
mul rcx
imul rcx, qword ptr [rsi + 24]
add rdi, qword ptr [rsp - 24] # 8-byte Folded Reload
add rdi, rbx
add rdi, rbp
add rdi, rcx
add rdi, rdx
add rax, r13
adc rdi, r8
add rax, qword ptr [rsp - 16] # 8-byte Folded Reload
movzx ecx, byte ptr [rsp - 25] # 1-byte Folded Reload
adc rdi, rcx
mov rcx, qword ptr [rsp - 8] # 8-byte Reload
mov qword ptr [r15], rcx
mov qword ptr [r15 + 8], r14
mov qword ptr [r15 + 16], rax
mov qword ptr [r15 + 24], rdi
mov rax, r15
pop rbx
pop r12
pop r13
pop r14
pop r15
pop rbp
ret
mul256_x86_64_no_mul128.s:

intx::mul256(intx::uint<256u> const&, intx::uint<256u> const&): # @intx::mul256(intx::uint<256u> const&, intx::uint<256u> const&)
push rbp
push r15
push r14
push r13
push r12
push rbx
mov qword ptr [rsp - 8], rdi # 8-byte Spill
mov qword ptr [rsp - 24], rsi # 8-byte Spill
mov rax, qword ptr [rsi]
mov rbx, qword ptr [rsi + 8]
mov r14, rax
mov qword ptr [rsp - 32], rax # 8-byte Spill
mov eax, r14d
shr r14, 32
mov qword ptr [rsp - 104], r14 # 8-byte Spill
mov r15, qword ptr [rdx]
mov r9, rdx
mov qword ptr [rsp - 80], rdx # 8-byte Spill
mov ebp, r15d
mov rdi, r15
shr rdi, 32
mov r8, rbp
mov rdx, rax
mov qword ptr [rsp - 96], rax # 8-byte Spill
imul r8, rax
mov rax, rbp
imul rax, r14
mov rcx, rdi
imul rcx, rdx
mov rdx, rdi
mov r10, rdi
mov qword ptr [rsp - 88], rdi # 8-byte Spill
imul rdx, r14
mov edi, r8d
shr r8, 32
add r8, rax
mov eax, r8d
add rax, rcx
mov rcx, rax
shl rcx, 32
or rdi, rcx
mov qword ptr [rsp - 16], rdi # 8-byte Spill
shr rax, 32
shr r8, 32
add r8, rdx
add r8, rax
mov qword ptr [rsp - 48], rbx # 8-byte Spill
mov r12d, ebx
shr rbx, 32
mov rcx, r12
mov qword ptr [rsp - 40], rbp # 8-byte Spill
imul rcx, rbp
mov rdx, rbx
imul rdx, rbp
mov rsi, rcx
shr rsi, 32
add rsi, rdx
mov rdx, r12
imul rdx, r10
mov r14d, esi
add r14, rdx
mov eax, ecx
mov rcx, r14
shl rcx, 32
or rax, rcx
mov qword ptr [rsp - 112], rax # 8-byte Spill
mov r11, qword ptr [r9 + 8]
mov r13d, r11d
mov qword ptr [rsp - 56], r11 # 8-byte Spill
mov rdx, r13
mov r9, r13
mov rbp, r13
imul r13, rbx
shr r11, 32
imul rbp, r12
imul r12, r11
mov rdi, r11
mov r10, r11
imul r11, rbx
imul rbx, qword ptr [rsp - 88] # 8-byte Folded Reload
shr rsi, 32
add rsi, rbx
mov rax, qword ptr [rsp - 96] # 8-byte Reload
imul rdx, rax
mov rcx, qword ptr [rsp - 104] # 8-byte Reload
imul r9, rcx
mov rbx, rdx
shr rbx, 32
add rbx, r9
imul rdi, rax
mov r9d, ebx
add r9, rdi
mov edi, edx
mov rdx, r9
shl rdx, 32
or rdi, rdx
imul r10, rcx
shr rbx, 32
add rbx, r10
shr r9, 32
add rbx, r9
mov r9, rbp
shr r9, 32
add r9, r13
mov r13d, r9d
add r13, r12
mov ecx, ebp
mov rdx, r13
shl rdx, 32
or rcx, rdx
shr r14, 32
shr r13, 32
shr r9, 32
mov rax, qword ptr [rsp - 112] # 8-byte Reload
add rax, r8
adc rsi, r14
add rax, rdi
mov qword ptr [rsp - 112], rax # 8-byte Spill
adc rbx, rcx
xor eax, eax
cmp rbx, rcx
setb al
mov qword ptr [rsp - 64], rax # 8-byte Spill
xor eax, eax
add rbx, rsi
setb al
mov qword ptr [rsp - 72], rax # 8-byte Spill
mov r10, qword ptr [rsp - 80] # 8-byte Reload
mov rsi, qword ptr [r10 + 16]
mov rax, rsi
shr rax, 32
mov ecx, esi
mov rdx, rcx
mov rdi, qword ptr [rsp - 96] # 8-byte Reload
imul rdx, rdi
mov rbp, qword ptr [rsp - 104] # 8-byte Reload
imul rcx, rbp
imul rdi, rax
mov r8, rdi
imul rax, rbp
mov rdi, rdx
shr rdi, 32
add rdi, rcx
mov r12d, edi
add r12, r8
mov ebp, edx
mov rcx, r12
shl rcx, 32
or rbp, rcx
mov rcx, qword ptr [rsp - 32] # 8-byte Reload
imul rcx, qword ptr [r10 + 24]
imul rsi, qword ptr [rsp - 48] # 8-byte Folded Reload
add rsi, r11
add rsi, rcx
add rsi, rax
mov r10, qword ptr [rsp - 24] # 8-byte Reload
mov rcx, qword ptr [r10 + 16]
mov r8, rcx
shr r8, 32
mov edx, ecx
mov r11, rdx
mov rax, qword ptr [rsp - 40] # 8-byte Reload
imul r11, rax
imul rax, r8
mov r14, rax
mov rax, qword ptr [rsp - 88] # 8-byte Reload
imul rdx, rax
imul r8, rax
mov rax, r11
shr rax, 32
add rax, r14
mov r14d, eax
add r14, rdx
mov r11d, r11d
mov rdx, r14
shl rdx, 32
or r11, rdx
imul rcx, qword ptr [rsp - 56] # 8-byte Folded Reload
add rcx, rsi
add rcx, r9
imul r15, qword ptr [r10 + 24]
add r15, rcx
add r15, r8
shr rdi, 32
add r15, rdi
add r15, r13
shr r12, 32
add r15, r12
shr rax, 32
add r15, rax
shr r14, 32
add r15, r14
add r11, rbp
adc r15, qword ptr [rsp - 64] # 8-byte Folded Reload
add r11, rbx
adc r15, qword ptr [rsp - 72] # 8-byte Folded Reload
mov rax, qword ptr [rsp - 8] # 8-byte Reload
mov rcx, qword ptr [rsp - 16] # 8-byte Reload
mov qword ptr [rax], rcx
mov rcx, qword ptr [rsp - 112] # 8-byte Reload
mov qword ptr [rax + 8], rcx
mov qword ptr [rax + 16], r11
mov qword ptr [rax + 24], r15
pop rbx
pop r12
pop r13
pop r14
pop r15
pop rbp
ret