Skip to content

Instantly share code, notes, and snippets.

@ifeelagood
Created October 9, 2024 04:25
Show Gist options
  • Save ifeelagood/a456be29d919b37565576d15b0ea0056 to your computer and use it in GitHub Desktop.
Save ifeelagood/a456be29d919b37565576d15b0ea0056 to your computer and use it in GitHub Desktop.
x86-64 assembly for the AVX2 escape-time algorithm, no memory access
;-----------------------------------------------------------------------
; void escape_time(float* cx, float* cy, unsigned* iters, unsigned maxit)
;
; Syntax: MASM (ml64), Intel operand order.
; ABI:    Microsoft x64.
; Needs:  AVX2.
;
; Runs the escape-time iteration  z = z*z + c  (z starts at 0) on eight
; points at once, one point per 32-bit lane of a YMM register. A lane
; stops being updated once zx^2 + zy^2 >= 4.0 (or is NaN) or once its
; counter reaches maxit; the loop ends when no lane is still active.
;
; In:    rcx = cx     -> 8 floats, real parts of c      (any alignment)
;        rdx = cy     -> 8 floats, imaginary parts of c (any alignment)
;        r8  = iters  -> 8 dwords, receives the per-lane iteration counts
;        r9d = maxit  -> iteration cap, treated as unsigned
; Out:   [r8 .. r8+31] = per-lane iteration counts
; Clobb: ymm0-ymm5, bits 255:128 of every ymm register (vzeroupper),
;        flags. xmm6-xmm11 are used but saved and restored.
; Stack: ESC_FRAME_BYTES (6 x 16 bytes of xmm saves + 8 bytes padding).
;
; Register roles inside the loop:
;   ymm0  cx
;   ymm1  cy
;   ymm2  zx
;   ymm3  zy
;   ymm4  zx^2, then the candidate new zx
;   ymm5  zy^2
;   ymm6  active-lane mask (all-ones = lane still iterating)
;   ymm7  scratch, then the candidate new zy
;   ymm8  iters
;   ymm9  maxit broadcast to every lane
;   ymm10 4.0 in every lane
;   ymm11 integer 1 in every lane
;-----------------------------------------------------------------------

; Entry rsp is 8 mod 16 (return address pushed), so subtracting a value
; that is also 8 mod 16 leaves rsp 16-byte aligned for the vmovaps saves.
ESC_FRAME_BYTES EQU 68h

.code
PUBLIC escape_time
escape_time PROC FRAME
        ; xmm6-xmm15 are callee-saved in the Microsoft x64 ABI; save the
        ; ones this routine overwrites and describe them for the unwinder.
        sub         rsp, ESC_FRAME_BYTES
        .allocstack ESC_FRAME_BYTES
        vmovaps     XMMWORD PTR [rsp + 00h], xmm6
        .savexmm128 xmm6, 00h
        vmovaps     XMMWORD PTR [rsp + 10h], xmm7
        .savexmm128 xmm7, 10h
        vmovaps     XMMWORD PTR [rsp + 20h], xmm8
        .savexmm128 xmm8, 20h
        vmovaps     XMMWORD PTR [rsp + 30h], xmm9
        .savexmm128 xmm9, 30h
        vmovaps     XMMWORD PTR [rsp + 40h], xmm10
        .savexmm128 xmm10, 40h
        vmovaps     XMMWORD PTR [rsp + 50h], xmm11
        .savexmm128 xmm11, 50h
        .endprolog

        ; Load c. Unaligned loads: the C signature only promises float*,
        ; so 32-byte alignment of the caller's buffers cannot be assumed.
        vmovups     ymm0, YMMWORD PTR [rcx]         ; cx
        vmovups     ymm1, YMMWORD PTR [rdx]         ; cy

        ; z = 0, iters = 0
        vxorps      ymm2, ymm2, ymm2                ; zx
        vxorps      ymm3, ymm3, ymm3                ; zy
        vpxor       ymm8, ymm8, ymm8                ; iters

        ; maxit in every lane (VEX form, so no legacy-SSE op is mixed in)
        vmovd       xmm9, r9d
        vpbroadcastd ymm9, xmm9

        ; Constants: broadcast one element instead of a 32-byte aligned
        ; load, because nothing guarantees the tables are 32-byte aligned.
        vbroadcastss ymm10, DWORD PTR [v_four]      ; 4.0
        vpbroadcastd ymm11, DWORD PTR [v_one]       ; 1

escape_loop:
        vmulps      ymm4, ymm2, ymm2                ; zx^2
        vmulps      ymm5, ymm3, ymm3                ; zy^2

        ; active = (zx^2 + zy^2 < 4.0) AND NOT (iters == maxit)
        ; iters counts up from 0 in steps of 1 and freezes at maxit, so
        ; "!= maxit" means "< maxit" over the whole unsigned range (a
        ; signed greater-than would misread maxit >= 80000000h).
        vaddps      ymm6, ymm4, ymm5
        vcmpltps    ymm6, ymm6, ymm10
        vpcmpeqd    ymm7, ymm8, ymm9
        vpandn      ymm6, ymm7, ymm6                ; ymm6 = ~ymm7 & ymm6
        vptest      ymm6, ymm6                      ; ZF = 1 when no lane is active
        jz          escape_done

        ; iters += 1 in active lanes only
        vpand       ymm7, ymm6, ymm11
        vpaddd      ymm8, ymm8, ymm7

        ; new zy = 2*zx*zy + cy (doubling done with an add)
        vmulps      ymm7, ymm2, ymm3
        vaddps      ymm7, ymm7, ymm7
        vaddps      ymm7, ymm7, ymm1

        ; new zx = zx^2 - zy^2 + cx
        vsubps      ymm4, ymm4, ymm5
        vaddps      ymm4, ymm4, ymm0

        ; commit the new z only in active lanes; finished lanes keep z
        vblendvps   ymm2, ymm2, ymm4, ymm6
        vblendvps   ymm3, ymm3, ymm7, ymm6
        jmp         escape_loop

escape_done:
        vmovdqu     YMMWORD PTR [r8], ymm8          ; store the 8 counters

        ; Clear the upper YMM halves before returning to code that may use
        ; legacy SSE, then put back the callee-saved low halves.
        vzeroupper
        vmovaps     xmm6,  XMMWORD PTR [rsp + 00h]
        vmovaps     xmm7,  XMMWORD PTR [rsp + 10h]
        vmovaps     xmm8,  XMMWORD PTR [rsp + 20h]
        vmovaps     xmm9,  XMMWORD PTR [rsp + 30h]
        vmovaps     xmm10, XMMWORD PTR [rsp + 40h]
        vmovaps     xmm11, XMMWORD PTR [rsp + 50h]
        add         rsp, ESC_FRAME_BYTES
        ret
escape_time ENDP
.data
; Per-lane constants for escape_time: eight 32-bit elements each, i.e.
; one 256-bit vector's worth.
; NOTE(review): no ALIGN directive precedes these tables, so nothing
; visible here guarantees 32-byte alignment - confirm before reading
; them with an alignment-checking 256-bit load.
; 2.0 in every lane
v_two REAL4 2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
; 4.0 in every lane; escape_time compares zx^2 + zy^2 against this
v_four REAL4 4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
; integer 1 in every lane; masked and added to the iteration counters
v_one DWORD 1,1,1,1,1,1,1,1
END
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment