Created
November 2, 2024 11:47
-
-
Save Lovesan/d551a32f6796e3606ef65b1984a4832a to your computer and use it in GitHub Desktop.
An example from an upcoming SSE/AVX math library for SBCL (and maybe other implementations)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; %FLOAT4-LOG: lane-wise natural logarithm of V, following the Cephes logf
;; scheme: split each lane into exponent E and a mantissa in [0.5, 1),
;; evaluate a degree-8 polynomial on the shifted mantissa, then add back
;; E * ln(2) (carried as the two-part constant Q1 + Q2).
;;
;; V is coerced with FLOAT4 first. Lanes where V <= 0 (zero included) are
;; returned with the comparison mask OR-ed in, i.e. as NaN.
(definline %float4-log (v)
  ;; min-norm-pos is the smallest non-denormalized float number
  (let* ((min-norm-pos (float4! (int4 #x00800000)))
         ;; complement of the exponent field: keeps sign and mantissa bits
         (inv-mant-mask (float4! (int4 (lognot #x7f800000))))
         (cephes-sqrthf (float4 +0.707106781186547524d0))
         (cephes-log-p0 (float4 +7.0376836292d-2))
         (cephes-log-p1 (float4 -1.1514610310d-1))
         (cephes-log-p2 (float4 +1.1676998740d-1))
         (cephes-log-p3 (float4 -1.2420140846d-1))
         (cephes-log-p4 (float4 +1.4249322787d-1))
         (cephes-log-p5 (float4 -1.6668057665d-1))
         (cephes-log-p6 (float4 +2.0000714765d-1))
         (cephes-log-p7 (float4 -2.4999993993d-1))
         (cephes-log-p8 (float4 +3.3333331174d-1))
         (cephes-log-q1 (float4 -2.12194440d-4))
         (cephes-log-q2 (float4 +0.693359375d0))
         (v (float4 v))
         (e (float4 0))
         (one (float4 1))
         ;; remember which lanes are <= 0 before V is clamped below
         (invalid-mask (float4! (float4<= v (float4 0))))
         ;; clamp to the smallest normal so the bit manipulation below
         ;; always sees a positive, normalized float
         (v (float4-max v min-norm-pos))
         ;; biased exponent: the float bits shifted right past the mantissa
         (emm0 (int4-shiftr (int4! v) 23)))
    ;; keep only the fractional part: clear the exponent field, then OR in
    ;; the exponent of 0.5 so that V lies in [0.5, 1); E becomes the
    ;; unbiased exponent plus one
    (setf v (float4-and v inv-mant-mask)
          v (float4-or v (float4 0.5))
          emm0 (int4- emm0 (int4 #x7f))
          e (float4 emm0)
          e (float4+ e one))
    ;; part2:
    ;; if( x < SQRTHF ) {
    ;;   e -= 1;
    ;;   x = x + x - 1.0;
    ;; } else { x = x - 1.0; }
    (let* ((mask (float4! (float4< v cephes-sqrthf)))
           ;; TMP is V in the lanes below SQRTHF and 0 elsewhere, so adding
           ;; it after the subtraction yields x + x - 1 only in those lanes
           (tmp (float4-and v mask))
           (z (float4 0))
           (y (float4 0)))
      (setf v (float4- v one)
            e (float4- e (float4-and one mask))
            v (float4+ v tmp)
            z (float4* v v)
            ;; Horner evaluation of the polynomial P0..P8 in V
            y cephes-log-p0
            y (float4-fmadd y v cephes-log-p1)
            y (float4-fmadd y v cephes-log-p2)
            y (float4-fmadd y v cephes-log-p3)
            y (float4-fmadd y v cephes-log-p4)
            y (float4-fmadd y v cephes-log-p5)
            y (float4-fmadd y v cephes-log-p6)
            y (float4-fmadd y v cephes-log-p7)
            y (float4-fmadd y v cephes-log-p8)
            y (float4* y v)
            y (float4* y z)
            ;; y += e * Q1, then y -= 0.5 * z
            y (float4-fmadd e cephes-log-q1 y)
            y (float4-fmadd z (float4 -0.5) y)
            v (float4+ v y)
            ;; v += e * Q2; uses FLOAT4-FMADD like every other step here
            v (float4-fmadd e cephes-log-q2 v)
            ;; non-positive arg (v <= 0) will be NAN
            v (float4-or v invalid-mask))
      v)))
;; FLOAT4-LOG: lane-wise logarithm of X.
;; Without BASE the result is (%FLOAT4-LOG X). When BASE is supplied the
;; result is (%FLOAT4-LOG X) divided by (%FLOAT4-LOG BASE).
;; Both arguments are coerced with FLOAT4 before use.
(definline float4-log (x &optional (base nil basep))
  (let ((log-x (%float4-log (float4 x))))
    (if basep
        (float4/ log-x (%float4-log (float4 base)))
        log-x)))
;; Show the native code generated for a call to FLOAT4-LOG on a single-float
;; argument, compiled for maximum speed with safety and debug turned off.
;; Note: boxing of (SIMD-PACK SINGLE-FLOAT) and such could be avoided if such
;; objects stay inside functions.
(disassemble
 #'(lambda (x)
     (declare (optimize (speed 3) (safety 0) (debug 0))
              (type single-float x))
     (float4-log x)))
; disassembly for (LAMBDA (X)) | |
; Size: 406 bytes. Origin: #x10011B8D94 ; (LAMBDA (X)) | |
; D94: 41844424F8 TEST AL, [R12-8] ; safepoint | |
; D99: C4E27918C0 VBROADCASTSS XMM0, XMM0 | |
; D9E: 0F57C9 XORPS XMM1, XMM1 | |
; DA1: C5F8C2F102 VCMPLEPS XMM6, XMM0, XMM1 | |
; DA6: 0F280DA3FEFFFF MOVAPS XMM1, [RIP-349] ; [#x10011B8C50] | |
; DAD: C5F85FC1 VMAXPS XMM0, XMM0, XMM1 | |
; DB1: 0F28D8 MOVAPS XMM3, XMM0 | |
; DB4: 660F6FC3 MOVDQA XMM0, XMM3 | |
; DB8: 660F72D017 PSRLD-IMM XMM0, 23 | |
; DBD: 0F28159CFEFFFF MOVAPS XMM2, [RIP-356] ; [#x10011B8C60] | |
; DC4: C5E054CA VANDPS XMM1, XMM3, XMM2 | |
; DC8: 0F28D9 MOVAPS XMM3, XMM1 | |
; DCB: 0F28159EFEFFFF MOVAPS XMM2, [RIP-354] ; [#x10011B8C70] | |
; DD2: C5E056CA VORPS XMM1, XMM3, XMM2 | |
; DD6: 0F28D9 MOVAPS XMM3, XMM1 | |
; DD9: 660F6F0D9FFEFFFF MOVDQA XMM1, [RIP-353] ; [#x10011B8C80] | |
; DE1: C5F9FAC1 VPSUBD XMM0, XMM0, XMM1 | |
; DE5: C5F85BC0 VCVTDQ2PS XMM0, XMM0 | |
; DE9: 0F28E0 MOVAPS XMM4, XMM0 | |
; DEC: 0F280D9DFEFFFF MOVAPS XMM1, [RIP-355] ; [#x10011B8C90] | |
; DF3: C5D858C1 VADDPS XMM0, XMM4, XMM1 | |
; DF7: 0F28E0 MOVAPS XMM4, XMM0 | |
; DFA: 0F28059FFEFFFF MOVAPS XMM0, [RIP-353] ; [#x10011B8CA0] | |
; E01: C5E0C2C001 VCMPLTPS XMM0, XMM3, XMM0 | |
; E06: C5E054D0 VANDPS XMM2, XMM3, XMM0 | |
; E0A: 0F280D7FFEFFFF MOVAPS XMM1, [RIP-385] ; [#x10011B8C90] | |
; E11: C5E05CC9 VSUBPS XMM1, XMM3, XMM1 | |
; E15: 0F28D9 MOVAPS XMM3, XMM1 | |
; E18: 0F280D71FEFFFF MOVAPS XMM1, [RIP-399] ; [#x10011B8C90] | |
; E1F: C5F054C8 VANDPS XMM1, XMM1, XMM0 | |
; E23: C5D85CC1 VSUBPS XMM0, XMM4, XMM1 | |
; E27: 0F28E0 MOVAPS XMM4, XMM0 | |
; E2A: C5E058C2 VADDPS XMM0, XMM3, XMM2 | |
; E2E: 0F28D8 MOVAPS XMM3, XMM0 | |
; E31: C5E059EB VMULPS XMM5, XMM3, XMM3 | |
; E35: 0F280D74FEFFFF MOVAPS XMM1, [RIP-396] ; [#x10011B8CB0] | |
; E3C: 0F28157DFEFFFF MOVAPS XMM2, [RIP-387] ; [#x10011B8CC0] | |
; E43: C4E261A8CA VFMADD213PS XMM1, XMM3, XMM2 | |
; E48: 0F281581FEFFFF MOVAPS XMM2, [RIP-383] ; [#x10011B8CD0] | |
; E4F: C4E261A8CA VFMADD213PS XMM1, XMM3, XMM2 | |
; E54: 0F281585FEFFFF MOVAPS XMM2, [RIP-379] ; [#x10011B8CE0] | |
; E5B: C4E261A8CA VFMADD213PS XMM1, XMM3, XMM2 | |
; E60: 0F281589FEFFFF MOVAPS XMM2, [RIP-375] ; [#x10011B8CF0] | |
; E67: C4E261A8CA VFMADD213PS XMM1, XMM3, XMM2 | |
; E6C: 0F28158DFEFFFF MOVAPS XMM2, [RIP-371] ; [#x10011B8D00] | |
; E73: C4E261A8CA VFMADD213PS XMM1, XMM3, XMM2 | |
; E78: 0F281591FEFFFF MOVAPS XMM2, [RIP-367] ; [#x10011B8D10] | |
; E7F: C4E261A8CA VFMADD213PS XMM1, XMM3, XMM2 | |
; E84: 0F281595FEFFFF MOVAPS XMM2, [RIP-363] ; [#x10011B8D20] | |
; E8B: C4E261A8CA VFMADD213PS XMM1, XMM3, XMM2 | |
; E90: 0F281599FEFFFF MOVAPS XMM2, [RIP-359] ; [#x10011B8D30] | |
; E97: C4E261A8CA VFMADD213PS XMM1, XMM3, XMM2 | |
; E9C: C5F059CB VMULPS XMM1, XMM1, XMM3 | |
; EA0: C5F059CD VMULPS XMM1, XMM1, XMM5 | |
; EA4: 0F283D95FEFFFF MOVAPS XMM7, [RIP-363] ; [#x10011B8D40] | |
; EAB: 0F28D4 MOVAPS XMM2, XMM4 | |
; EAE: C4E241A8D1 VFMADD213PS XMM2, XMM7, XMM1 | |
; EB3: 0F28CA MOVAPS XMM1, XMM2 | |
; EB6: 0F283D93FEFFFF MOVAPS XMM7, [RIP-365] ; [#x10011B8D50] | |
; EBD: 0F28D5 MOVAPS XMM2, XMM5 | |
; EC0: C4E241A8D1 VFMADD213PS XMM2, XMM7, XMM1 | |
; EC5: 0F28CA MOVAPS XMM1, XMM2 | |
; EC8: C5E058C1 VADDPS XMM0, XMM3, XMM1 | |
; ECC: 0F28D8 MOVAPS XMM3, XMM0 | |
; ECF: 0F280D8AFEFFFF MOVAPS XMM1, [RIP-374] ; [#x10011B8D60] | |
; ED6: 0F28C4 MOVAPS XMM0, XMM4 | |
; ED9: C4E271A8C3 VFMADD213PS XMM0, XMM1, XMM3 | |
; EDE: 0F28C8 MOVAPS XMM1, XMM0 | |
; EE1: 0F28D9 MOVAPS XMM3, XMM1 | |
; EE4: C5E056C6 VORPS XMM0, XMM3, XMM6 | |
; EE8: 0F28D8 MOVAPS XMM3, XMM0 | |
; EEB: 498B5570 MOV RDX, [R13+112] ; thread.mixed-tlab | |
; EEF: 4883C220 ADD RDX, 32 | |
; EF3: 493B5578 CMP RDX, [R13+120] | |
; EF7: 7722 JA L1 | |
; EF9: 49895570 MOV [R13+112], RDX ; thread.mixed-tlab | |
; EFD: 4883C2EF ADD RDX, -17 | |
; F01: L0: 66C742F16503 MOV WORD PTR [RDX-15], 869 | |
; F07: 41844424F8 TEST AL, [R12-8] ; safepoint | |
; F0C: 48C742F900000000 MOV QWORD PTR [RDX-7], 0 | |
; F14: 0F295A01 MOVAPS [RDX+1], XMM3 | |
; F18: C9 LEAVE | |
; F19: F8 CLC | |
; F1A: C3 RET | |
; F1B: L1: 6A20 PUSH 32 | |
; F1D: FF142578040120 CALL [#x20010478] ; #x10000004F0: ALLOC-TRAMP | |
; F24: 5A POP RDX | |
; F25: 80CA0F OR DL, 15 | |
; F28: EBD7 JMP L0 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment