Skip to content

Instantly share code, notes, and snippets.

@stepantubanov
Last active January 25, 2021 10:31
Show Gist options
  • Select an option

  • Save stepantubanov/92eac266747064728d84cc252bf06c1e to your computer and use it in GitHub Desktop.

Select an option

Save stepantubanov/92eac266747064728d84cc252bf06c1e to your computer and use it in GitHub Desktop.
ADD vs ADC bigint mul
.intel_syntax noprefix
.global _main
# -----------------------------------------------------------------------
# _main: micro-benchmark, multiply a large integer by one 64-bit word.
#
# Carves two equally sized buffers out of the stack (input at RSP, output
# at R13) and runs the multiply loop below 5,000,000 times. The input
# buffer is never written by this code, so the loop multiplies whatever
# bytes the stack already holds. The product is stored but never read
# back, and the carry out of the topmost block is discarded.
#
# Out:  EAX = 0.
# Uses: MULX, which belongs to the BMI2 extension; no CPU feature check is
#       made here.
# Register roles outside the inner loop:
#   RBP  = RSP as it was after the pushes (restored in the epilogue)
#   R12D = integer size in 32-byte blocks
#   R11  = integer size in bytes
#   R13  = output buffer
#   R10D = benchmark iterations left
#   RDX  = multiplier (implicit MULX source operand)
# -----------------------------------------------------------------------
_main:
push rbp
push r12
push r13
push r14 # NOTE(review): R14 is saved and restored but not otherwise used in this routine.
mov rbp, rsp # Keep the post-push RSP so the epilogue can drop the buffers in one move.
mov r12d, 300 # R12D is number of 32 byte blocks.
mov r11d, r12d # 32-bit write also clears the upper half of R11, so R11 is usable as a 64-bit size below.
shl r11d, 5 # R11D = Size of large integer in bytes (R12D * 32).
# Size check: 300 x 32 = 9.6 KB per integer (so total 19.2 KB).
sub rsp, r11
sub rsp, r11 # Allocate 2*R11 bytes on stack (input + output).
and rsp, -32 # Align on 32-byte boundary.
lea r13, [rsp+r11] # R13 = output buffer, starting right after the input buffer at RSP.
mov r10d, 5000000 # R10D = Number of benchmark loop iterations (5 mil).
mov rdx, 0x12345678 # RDX = Multiplier.
L_bench_loop:
# Rewind both pointers and the block counter for one full pass.
mov rsi, rsp
mov rdi, r13
mov ecx, r12d
# Code to multiply large integer by a qword.
# RSI = input large integer (qword array).
# RDI = output large integer (qword array).
# RDX = qword to multiply the large integer by.
# ECX = number of 32-byte blocks to process (i.e. qwords count / 4).
# RAX = carry
#
# MULX writes the high half of RDX * source to its first operand and the
# low half to its second, and leaves the flags alone; MOV does not touch
# them either. CF produced by the ADD/ADC for one qword therefore survives
# until the ADC for the next qword.
xor eax, eax # Carry into the lowest qword is zero.
# Start the hot loop on a 16-byte boundary.
.p2align 4
L_mul_word_loop:
mulx r9, r8, [rsi] # R9:R8 = RDX * qword 0.
add r8, rax # Low half of qword 0 + carry word from the previous block. Original note: <-- "adc r8, rax" (+20-30% performance)
mov [rdi], r8
mulx rax, r8, [rsi+8] # RAX:R8 = RDX * qword 1.
adc r9, r8 # High half of qword 0 + low half of qword 1 + CF.
mov [rdi+8], r9
mulx r9, r8, [rsi+16] # R9:R8 = RDX * qword 2.
adc r8, rax # Low half of qword 2 + high half of qword 1 + CF.
mov [rdi+16], r8
mulx rax, r8, [rsi+24] # RAX:R8 = RDX * qword 3.
adc r9, r8 # High half of qword 2 + low half of qword 3 + CF.
mov [rdi+24], r9
adc rax, 0 # Fold the last CF into the high half of qword 3: RAX is now the carry word for the next block.
add rdi, 32 # Advance both pointers by one 32-byte block.
add rsi, 32
dec ecx
jnz L_mul_word_loop
dec r10d # One benchmark pass finished.
jnz L_bench_loop
xor eax, eax # EAX = 0 on exit.
mov rsp, rbp # Release both stack buffers.
pop r14 # Restore the saved registers in reverse push order.
pop r13
pop r12
pop rbp
ret
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment