Skip to content

Instantly share code, notes, and snippets.

@Lovesan
Created November 2, 2024 11:47
Show Gist options
  • Save Lovesan/d551a32f6796e3606ef65b1984a4832a to your computer and use it in GitHub Desktop.
Save Lovesan/d551a32f6796e3606ef65b1984a4832a to your computer and use it in GitHub Desktop.
An example from an upcoming SSE/AVX math library for SBCL (and maybe other implementations)
(definline %float4-log (v)
  ;; Natural logarithm of every lane of V, computed with packed
  ;; single-float operations.
  ;;
  ;; V is first passed through FLOAT4.  The value is then split into an
  ;; exponent E and a mantissa in [0.5, 1), the mantissa is shifted so that
  ;; a polynomial in V (coefficients cephes-log-p0 .. p8, Horner form) can
  ;; be evaluated, and E * ln(2) is added back in two pieces
  ;; (cephes-log-q1 + cephes-log-q2, which together sum to roughly ln 2).
  ;;
  ;; Lanes whose input compares <= 0 (zero included) are returned with all
  ;; bits set, i.e. as NaN.
  ;;
  ;; NOTE(review): FLOAT4! / INT4! look like bit reinterpretations rather
  ;; than numeric conversions, while FLOAT4 applied to an INT4 is a numeric
  ;; conversion -- confirm against their definitions.
  (let* (;; min-norm-pos is the smallest non denormalized float number
         (min-norm-pos (float4! (int4 #x00800000)))
         ;; every bit except the exponent field #x7f800000
         (inv-mant-mask (float4! (int4 (lognot #x7f800000))))
         (cephes-sqrthf (float4 +0.707106781186547524d0))
         (cephes-log-p0 (float4 +7.0376836292d-2))
         (cephes-log-p1 (float4 -1.1514610310d-1))
         (cephes-log-p2 (float4 +1.1676998740d-1))
         (cephes-log-p3 (float4 -1.2420140846d-1))
         (cephes-log-p4 (float4 +1.4249322787d-1))
         (cephes-log-p5 (float4 -1.6668057665d-1))
         (cephes-log-p6 (float4 +2.0000714765d-1))
         (cephes-log-p7 (float4 -2.4999993993d-1))
         (cephes-log-p8 (float4 +3.3333331174d-1))
         (cephes-log-q1 (float4 -2.12194440d-4))
         (cephes-log-q2 (float4 +0.693359375d0))
         (v (float4 v))
         (e (float4 0))
         (one (float4 1))
         ;; all-ones in every lane where the input is <= 0; OR-ed into the
         ;; result at the very end
         (invalid-mask (float4! (float4<= v (float4 0))))
         ;; clamp from below so no lane is zero, negative or denormal
         (v (float4-max v min-norm-pos))
         ;; raw bits shifted right by 23: the biased exponent of each lane
         (emm0 (int4-shiftr (int4! v) 23)))
    ;; keep only the fractional part
    (setf v (float4-and v inv-mant-mask)
          ;; install the exponent bits of 0.5
          v (float4-or v (float4 0.5))
          ;; remove the exponent bias
          emm0 (int4- emm0 (int4 #x7f))
          e (float4 emm0)
          e (float4+ e one))
    ;; part2:
    ;; if( x < SQRTHF ) {
    ;; e -= 1;
    ;; x = x + x - 1.0;
    ;; } else { x = x - 1.0; }
    (let* ((mask (float4! (float4< v cephes-sqrthf)))
           ;; v in the lanes selected by mask, zero bits elsewhere
           (tmp (float4-and v mask))
           (z (float4 0))
           (y (float4 0)))
      (setf v (float4- v one)
            ;; branch-free "e -= 1" for the selected lanes only
            e (float4- e (float4-and one mask))
            ;; branch-free "x = x + x - 1.0" for the selected lanes
            v (float4+ v tmp)
            z (float4* v v)
            ;; Horner evaluation of the polynomial p0 .. p8 in v
            y cephes-log-p0
            y (float4-fmadd y v cephes-log-p1)
            y (float4-fmadd y v cephes-log-p2)
            y (float4-fmadd y v cephes-log-p3)
            y (float4-fmadd y v cephes-log-p4)
            y (float4-fmadd y v cephes-log-p5)
            y (float4-fmadd y v cephes-log-p6)
            y (float4-fmadd y v cephes-log-p7)
            y (float4-fmadd y v cephes-log-p8)
            y (float4* y v)
            y (float4* y z)
            ;; y += e * q1
            y (float4-fmadd e cephes-log-q1 y)
            ;; y -= z / 2
            y (float4-fmadd z (float4 -0.5) y)
            v (float4+ v y)
            ;; v += e * q2
            v (float4-fmadd e cephes-log-q2 v)
            ;; lanes whose input was <= 0 become NaN (all bits set)
            v (float4-or v invalid-mask))
      v)))
(definline float4-log (x &optional (base nil basep))
  ;; Per-lane logarithm of X.
  ;;
  ;; Without BASE the result is (%float4-log x).  When BASE is supplied the
  ;; result is (%float4-log x) divided by (%float4-log base).  Both
  ;; arguments are passed through FLOAT4 before use.
  (let ((arg (float4 x)))
    (if (not basep)
        (%float4-log arg)
        (let ((radix (float4 base)))
          (float4/ (%float4-log arg)
                   (%float4-log radix))))))
;; Print the machine code of an anonymous function that applies FLOAT4-LOG
;; to a single SINGLE-FLOAT argument, compiled with (speed 3) (safety 0)
;; (debug 0).  No BASE is passed, so only the one-argument path of
;; FLOAT4-LOG is exercised.
(disassemble
(lambda (x)
(declare (type single-float x) (optimize (speed 3) (safety 0) (debug 0)))
(float4-log x))) ;; the boxing of (SIMD-PACK SINGLE-FLOAT) and similar objects could be avoided if such objects stayed inside functions
; disassembly for (LAMBDA (X))
; Size: 406 bytes. Origin: #x10011B8D94 ; (LAMBDA (X))
; D94: 41844424F8 TEST AL, [R12-8] ; safepoint
; D99: C4E27918C0 VBROADCASTSS XMM0, XMM0
; D9E: 0F57C9 XORPS XMM1, XMM1
; DA1: C5F8C2F102 VCMPLEPS XMM6, XMM0, XMM1
; DA6: 0F280DA3FEFFFF MOVAPS XMM1, [RIP-349] ; [#x10011B8C50]
; DAD: C5F85FC1 VMAXPS XMM0, XMM0, XMM1
; DB1: 0F28D8 MOVAPS XMM3, XMM0
; DB4: 660F6FC3 MOVDQA XMM0, XMM3
; DB8: 660F72D017 PSRLD-IMM XMM0, 23
; DBD: 0F28159CFEFFFF MOVAPS XMM2, [RIP-356] ; [#x10011B8C60]
; DC4: C5E054CA VANDPS XMM1, XMM3, XMM2
; DC8: 0F28D9 MOVAPS XMM3, XMM1
; DCB: 0F28159EFEFFFF MOVAPS XMM2, [RIP-354] ; [#x10011B8C70]
; DD2: C5E056CA VORPS XMM1, XMM3, XMM2
; DD6: 0F28D9 MOVAPS XMM3, XMM1
; DD9: 660F6F0D9FFEFFFF MOVDQA XMM1, [RIP-353] ; [#x10011B8C80]
; DE1: C5F9FAC1 VPSUBD XMM0, XMM0, XMM1
; DE5: C5F85BC0 VCVTDQ2PS XMM0, XMM0
; DE9: 0F28E0 MOVAPS XMM4, XMM0
; DEC: 0F280D9DFEFFFF MOVAPS XMM1, [RIP-355] ; [#x10011B8C90]
; DF3: C5D858C1 VADDPS XMM0, XMM4, XMM1
; DF7: 0F28E0 MOVAPS XMM4, XMM0
; DFA: 0F28059FFEFFFF MOVAPS XMM0, [RIP-353] ; [#x10011B8CA0]
; E01: C5E0C2C001 VCMPLTPS XMM0, XMM3, XMM0
; E06: C5E054D0 VANDPS XMM2, XMM3, XMM0
; E0A: 0F280D7FFEFFFF MOVAPS XMM1, [RIP-385] ; [#x10011B8C90]
; E11: C5E05CC9 VSUBPS XMM1, XMM3, XMM1
; E15: 0F28D9 MOVAPS XMM3, XMM1
; E18: 0F280D71FEFFFF MOVAPS XMM1, [RIP-399] ; [#x10011B8C90]
; E1F: C5F054C8 VANDPS XMM1, XMM1, XMM0
; E23: C5D85CC1 VSUBPS XMM0, XMM4, XMM1
; E27: 0F28E0 MOVAPS XMM4, XMM0
; E2A: C5E058C2 VADDPS XMM0, XMM3, XMM2
; E2E: 0F28D8 MOVAPS XMM3, XMM0
; E31: C5E059EB VMULPS XMM5, XMM3, XMM3
; E35: 0F280D74FEFFFF MOVAPS XMM1, [RIP-396] ; [#x10011B8CB0]
; E3C: 0F28157DFEFFFF MOVAPS XMM2, [RIP-387] ; [#x10011B8CC0]
; E43: C4E261A8CA VFMADD213PS XMM1, XMM3, XMM2
; E48: 0F281581FEFFFF MOVAPS XMM2, [RIP-383] ; [#x10011B8CD0]
; E4F: C4E261A8CA VFMADD213PS XMM1, XMM3, XMM2
; E54: 0F281585FEFFFF MOVAPS XMM2, [RIP-379] ; [#x10011B8CE0]
; E5B: C4E261A8CA VFMADD213PS XMM1, XMM3, XMM2
; E60: 0F281589FEFFFF MOVAPS XMM2, [RIP-375] ; [#x10011B8CF0]
; E67: C4E261A8CA VFMADD213PS XMM1, XMM3, XMM2
; E6C: 0F28158DFEFFFF MOVAPS XMM2, [RIP-371] ; [#x10011B8D00]
; E73: C4E261A8CA VFMADD213PS XMM1, XMM3, XMM2
; E78: 0F281591FEFFFF MOVAPS XMM2, [RIP-367] ; [#x10011B8D10]
; E7F: C4E261A8CA VFMADD213PS XMM1, XMM3, XMM2
; E84: 0F281595FEFFFF MOVAPS XMM2, [RIP-363] ; [#x10011B8D20]
; E8B: C4E261A8CA VFMADD213PS XMM1, XMM3, XMM2
; E90: 0F281599FEFFFF MOVAPS XMM2, [RIP-359] ; [#x10011B8D30]
; E97: C4E261A8CA VFMADD213PS XMM1, XMM3, XMM2
; E9C: C5F059CB VMULPS XMM1, XMM1, XMM3
; EA0: C5F059CD VMULPS XMM1, XMM1, XMM5
; EA4: 0F283D95FEFFFF MOVAPS XMM7, [RIP-363] ; [#x10011B8D40]
; EAB: 0F28D4 MOVAPS XMM2, XMM4
; EAE: C4E241A8D1 VFMADD213PS XMM2, XMM7, XMM1
; EB3: 0F28CA MOVAPS XMM1, XMM2
; EB6: 0F283D93FEFFFF MOVAPS XMM7, [RIP-365] ; [#x10011B8D50]
; EBD: 0F28D5 MOVAPS XMM2, XMM5
; EC0: C4E241A8D1 VFMADD213PS XMM2, XMM7, XMM1
; EC5: 0F28CA MOVAPS XMM1, XMM2
; EC8: C5E058C1 VADDPS XMM0, XMM3, XMM1
; ECC: 0F28D8 MOVAPS XMM3, XMM0
; ECF: 0F280D8AFEFFFF MOVAPS XMM1, [RIP-374] ; [#x10011B8D60]
; ED6: 0F28C4 MOVAPS XMM0, XMM4
; ED9: C4E271A8C3 VFMADD213PS XMM0, XMM1, XMM3
; EDE: 0F28C8 MOVAPS XMM1, XMM0
; EE1: 0F28D9 MOVAPS XMM3, XMM1
; EE4: C5E056C6 VORPS XMM0, XMM3, XMM6
; EE8: 0F28D8 MOVAPS XMM3, XMM0
; EEB: 498B5570 MOV RDX, [R13+112] ; thread.mixed-tlab
; EEF: 4883C220 ADD RDX, 32
; EF3: 493B5578 CMP RDX, [R13+120]
; EF7: 7722 JA L1
; EF9: 49895570 MOV [R13+112], RDX ; thread.mixed-tlab
; EFD: 4883C2EF ADD RDX, -17
; F01: L0: 66C742F16503 MOV WORD PTR [RDX-15], 869
; F07: 41844424F8 TEST AL, [R12-8] ; safepoint
; F0C: 48C742F900000000 MOV QWORD PTR [RDX-7], 0
; F14: 0F295A01 MOVAPS [RDX+1], XMM3
; F18: C9 LEAVE
; F19: F8 CLC
; F1A: C3 RET
; F1B: L1: 6A20 PUSH 32
; F1D: FF142578040120 CALL [#x20010478] ; #x10000004F0: ALLOC-TRAMP
; F24: 5A POP RDX
; F25: 80CA0F OR DL, 15
; F28: EBD7 JMP L0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment