stepantubanov · January 25, 2021 10:31
diff --git a/add-adc.asm b/add-adc.asm
 # Microbenchmark: multiply a large integer (an array of qwords) by a single
 # qword, over and over, using MULX plus an ADD/ADC carry chain.
 # Syntax: GNU as, Intel operand order, registers written without a prefix.
 # NOTE(review): the leading underscore on _main suggests a Mach-O (macOS)
 # target -- confirm against the actual build command.
 .intel_syntax noprefix

 .global _main

 # Register roles for the whole program:
 #   RBP  = value of RSP right after the pushes; used to release the buffers.
 #   R12D = number of 32-byte blocks per large integer.
 #   R11  = size of one large integer in bytes.
 #   RSP  = base of the input integer; R13 = base of the output integer.
 #   R10D = remaining benchmark iterations.
 #   RDX  = multiplier (implicit source operand of MULX).
 # R14 is saved and restored but is not otherwise referenced in this listing.
 _main:
  push rbp
  push r12
  push r13
  push r14
  mov rbp, rsp      # Remember RSP so the epilogue can drop the buffers.

  mov r12d, 300     # R12D is number of 32 byte blocks.

  mov r11d, r12d
  shl r11d, 5       # R11D = Size of large integer in bytes (R12D * 32).
                    # With 300 blocks that is 9.6 KB each, 19.2 KB for both.
  sub rsp, r11
  sub rsp, r11      # Allocate 2*R11 bytes on the stack (input + output).
  and rsp, -32      # Round RSP down to a 32-byte boundary.

  # Neither buffer is initialized in this listing, so the input integer holds
  # whatever bytes were already on the stack.
  lea r13, [rsp+r11]      # R13 = output buffer, R11 bytes above the input at RSP.
  mov r10d, 5000000       # R10D = Number of benchmark loop iterations (5 mil).
  mov rdx, 0x12345678     # RDX = Multiplier.

 L_bench_loop:
  # Every benchmark iteration restarts at the beginning of both buffers.
  mov rsi, rsp
  mov rdi, r13
  mov ecx, r12d

  # Code to multiply large integer by a qword.
  # RSI = input large integer (qword array).
  # RDI = output large integer (qword array).
  # RDX = qword to multiply the large integer by.
  # ECX = number of 32-byte blocks to process (i.e. qwords count / 4).
  # RAX = carry (high qword of the previous product) into the next qword.

  xor eax, eax      # No carry into the first qword; XOR also clears CF.

  # Align the hot loop entry to a 16-byte boundary.
 .p2align 4
 L_mul_word_loop:
  # One pass handles four qwords. MULX writes (high, low) of RDX * [mem] and
  # leaves the flags alone, and MOV does not modify flags either, so the CF
  # produced by one ADD/ADC is still intact when the next ADC consumes it.
  mulx r9, r8, [rsi]
  add r8, rax         # low0 + carry-in; starts the CF chain for this block.
                      # Author's note: using "adc r8, rax" here instead was
                      # reported as 20-30% faster.
                      # NOTE(review): CF appears to be clear whenever this
                      # instruction runs (XOR clears it before the loop, and
                      # DEC leaves CF unchanged after "add rsi, 32"), so the
                      # ADC form should give the same result -- confirm
                      # before swapping.
  mov [rdi], r8

  mulx rax, r8, [rsi+8]
  adc r9, r8          # high0 + low1 + CF
  mov [rdi+8], r9

  mulx r9, r8, [rsi+16]
  adc r8, rax         # low2 + high1 + CF
  mov [rdi+16], r8

  mulx rax, r8, [rsi+24]
  adc r9, r8          # high2 + low3 + CF
  mov [rdi+24], r9

  adc rax, 0          # Fold CF into high3: RAX is the carry for the next block.
  add rdi, 32         # These ADDs overwrite CF, which is why it was folded
  add rsi, 32         # into RAX first.

  dec ecx
  jnz L_mul_word_loop

  # The carry left in RAX after the last block is not stored anywhere.
  dec r10d
  jnz L_bench_loop

  xor eax, eax        # Return 0.
  mov rsp, rbp        # Release both buffers together with the alignment padding.
  pop r14             # Restore saved registers in reverse push order.
  pop r13
  pop r12
  pop rbp
  ret
	# Microbenchmark: multiply a large integer (an array of qwords) by a single
	# qword, over and over, using MULX plus an ADD/ADC carry chain.
	# Syntax: GNU as, Intel operand order, registers written without a prefix.
	# NOTE(review): the leading underscore on _main suggests a Mach-O (macOS)
	# target -- confirm against the actual build command.
	.intel_syntax noprefix

	.global _main

	# Register roles for the whole program:
	#   RBP  = value of RSP right after the pushes; used to release the buffers.
	#   R12D = number of 32-byte blocks per large integer.
	#   R11  = size of one large integer in bytes.
	#   RSP  = base of the input integer; R13 = base of the output integer.
	#   R10D = remaining benchmark iterations.
	#   RDX  = multiplier (implicit source operand of MULX).
	# R14 is saved and restored but is not otherwise referenced in this listing.
	_main:
	push rbp
	push r12
	push r13
	push r14
	mov rbp, rsp # Remember RSP so the epilogue can drop the buffers.

	mov r12d, 300 # R12D is number of 32 byte blocks.

	mov r11d, r12d
	shl r11d, 5 # R11D = Size of large integer in bytes (R12D * 32).
	# With 300 blocks that is 9.6 KB each, 19.2 KB for both.
	sub rsp, r11
	sub rsp, r11 # Allocate 2*R11 bytes on the stack (input + output).
	and rsp, -32 # Round RSP down to a 32-byte boundary.

	# Neither buffer is initialized in this listing, so the input integer holds
	# whatever bytes were already on the stack.
	lea r13, [rsp+r11] # R13 = output buffer, R11 bytes above the input at RSP.
	mov r10d, 5000000 # R10D = Number of benchmark loop iterations (5 mil).
	mov rdx, 0x12345678 # RDX = Multiplier.

	L_bench_loop:
	# Every benchmark iteration restarts at the beginning of both buffers.
	mov rsi, rsp
	mov rdi, r13
	mov ecx, r12d

	# Code to multiply large integer by a qword.
	# RSI = input large integer (qword array).
	# RDI = output large integer (qword array).
	# RDX = qword to multiply the large integer by.
	# ECX = number of 32-byte blocks to process (i.e. qwords count / 4).
	# RAX = carry (high qword of the previous product) into the next qword.

	xor eax, eax # No carry into the first qword; XOR also clears CF.

	# Align the hot loop entry to a 16-byte boundary.
	.p2align 4
	L_mul_word_loop:
	# One pass handles four qwords. MULX writes (high, low) of RDX * [mem] and
	# leaves the flags alone, and MOV does not modify flags either, so the CF
	# produced by one ADD/ADC is still intact when the next ADC consumes it.
	mulx r9, r8, [rsi]
	add r8, rax # low0 + carry-in; starts the CF chain for this block.
	# Author's note: using "adc r8, rax" above instead was reported as
	# 20-30% faster.
	# NOTE(review): CF appears to be clear whenever that instruction runs
	# (XOR clears it before the loop, and DEC leaves CF unchanged after
	# "add rsi, 32"), so the ADC form should give the same result -- confirm
	# before swapping.
	mov [rdi], r8

	mulx rax, r8, [rsi+8]
	adc r9, r8 # high0 + low1 + CF
	mov [rdi+8], r9

	mulx r9, r8, [rsi+16]
	adc r8, rax # low2 + high1 + CF
	mov [rdi+16], r8

	mulx rax, r8, [rsi+24]
	adc r9, r8 # high2 + low3 + CF
	mov [rdi+24], r9

	adc rax, 0 # Fold CF into high3: RAX is the carry for the next block.
	add rdi, 32 # These ADDs overwrite CF, which is why it was folded
	add rsi, 32 # into RAX first.

	dec ecx
	jnz L_mul_word_loop

	# The carry left in RAX after the last block is not stored anywhere.
	dec r10d
	jnz L_bench_loop

	xor eax, eax # Return 0.
	mov rsp, rbp # Release both buffers together with the alignment padding.
	pop r14 # Restore saved registers in reverse push order.
	pop r13
	pop r12
	pop rbp
	ret
No results found