Last active
August 29, 2015 14:21
-
-
Save reinsteam/5be398c745e562ee3a69 to your computer and use it in GitHub Desktop.
Approximate log2 computation (12, 16, or 32 floats per call) using the SSE2/AVX2 instruction sets, based on a minimax polynomial approximation (coefficients found with the Remez exchange algorithm)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
.data
; Constants for _mm256_log2_ps_packet32.  Each value is replicated eight
; times so that a single ymmword memory operand supplies the same 32-bit
; value to every lane of a YMM register.

; Bit-field helpers used to split a float into mantissa and exponent.
cval_one        DD 8 DUP (03f800000r)   ; 1.0f
mantissa_mask   DD 8 DUP (07fffffH)     ; low 23 bits of each dword
exponent_shft   DD 8 DUP (07fH)         ; 127, subtracted from (bits >> 23)

; Polynomial coefficients log2_cK multiply m^K in the routine below.
; They are stored in the order the routine consumes them.
log2_c5         DD 8 DUP (0bd0d0cc5r)   ; -0.034436
log2_c2         DD 8 DUP (04026537br)   ; 2.59885
log2_c4         DD 8 DUP (03ea2ecddr)   ; 0.318213
log2_c1         DD 8 DUP (0c054bfadr)   ; -3.3242
log2_c3         DD 8 DUP (0bf9da2c9r)   ; -1.23153
log2_c0         DD 8 DUP (04047691ar)   ; 3.11579
.code | |
;-----------------------------------------------------------------------
; _mm256_log2_ps_packet32
;   Replaces 32 packed single-precision floats with an approximation of
;   their base-2 logarithm, in place.
;   C-equivalent (presumed): void _mm256_log2_ps_packet32(float *data);
;
; ABI:   Microsoft x64
; In:    rcx = address of 32 floats; must be 32-byte aligned because the
;              data is loaded and stored with vmovaps
; Out:   [rcx .. rcx+127] overwritten with the results; rax is not set
; Clobb: ymm0-ymm5, and the upper 128 bits of ymm6-ymm15 (cleared by
;        vzeroupper).  xmm6-xmm15 are non-volatile in this ABI and are
;        saved/restored here.
; Stack: 168 bytes (10 x 16-byte XMM save slots + 8 bytes so that rsp is
;        16-byte aligned for the vmovaps saves)
; Needs: AVX2 (vpsrld / vpsubd on YMM registers)
;
; Method, per lane, with bits = raw IEEE-754 pattern of x:
;   m = (bits AND 007fffffh) OR 1.0f
;   e = float((bits >> 23) - 127)
;   result = (m - 1) * (c0 + c1*m + c2*m^2 + c3*m^3 + c4*m^4 + c5*m^5) + e
; The shift is logical and the sign bit is not masked, and there is no
; special-casing of 0, denormals, infinities or NaN, so the result is
; only meaningful for positive normal inputs.
; Four YMM registers are processed in lock-step to hide latency; the
; instruction order of the arithmetic is deliberately left unchanged.
;-----------------------------------------------------------------------
        align   16                      ; align the entry point itself
_mm256_log2_ps_packet32 proc frame
        sub     rsp, 168
        .allocstack 168
        vmovaps xmmword ptr [rsp +   0], xmm6
        .savexmm128 xmm6, 0
        vmovaps xmmword ptr [rsp +  16], xmm7
        .savexmm128 xmm7, 16
        vmovaps xmmword ptr [rsp +  32], xmm8
        .savexmm128 xmm8, 32
        vmovaps xmmword ptr [rsp +  48], xmm9
        .savexmm128 xmm9, 48
        vmovaps xmmword ptr [rsp +  64], xmm10
        .savexmm128 xmm10, 64
        vmovaps xmmword ptr [rsp +  80], xmm11
        .savexmm128 xmm11, 80
        vmovaps xmmword ptr [rsp +  96], xmm12
        .savexmm128 xmm12, 96
        vmovaps xmmword ptr [rsp + 112], xmm13
        .savexmm128 xmm13, 112
        vmovaps xmmword ptr [rsp + 128], xmm14
        .savexmm128 xmm14, 128
        vmovaps xmmword ptr [rsp + 144], xmm15
        .savexmm128 xmm15, 144
        .endprolog

        ; ymm0-3 = input, ymm4-7 = second copy for exponent extraction
        vmovaps ymm0, ymmword ptr [rcx + 0]
        vmovaps ymm1, ymmword ptr [rcx + 32]
        vmovaps ymm2, ymmword ptr [rcx + 64]
        vmovaps ymm3, ymmword ptr [rcx + 96]
        vmovaps ymm4, ymm0
        vmovaps ymm5, ymm1
        vmovaps ymm6, ymm2
        vmovaps ymm7, ymm3

        ; ymm0-3 = mantissa bits only
        vandps  ymm0, ymm0, ymmword ptr mantissa_mask
        vandps  ymm1, ymm1, ymmword ptr mantissa_mask
        vandps  ymm2, ymm2, ymmword ptr mantissa_mask
        vandps  ymm3, ymm3, ymmword ptr mantissa_mask

        ; ymm4-7 = bits >> 23 (logical; sign bit ends up in bit 8)
        vpsrld  ymm4, ymm4, 23
        vpsrld  ymm5, ymm5, 23
        vpsrld  ymm6, ymm6, 23
        vpsrld  ymm7, ymm7, 23

        ; ymm0-3 = m = mantissa bits with the exponent of 1.0f
        vorps   ymm0, ymm0, ymmword ptr cval_one
        vorps   ymm1, ymm1, ymmword ptr cval_one
        vorps   ymm2, ymm2, ymmword ptr cval_one
        vorps   ymm3, ymm3, ymmword ptr cval_one

        ; ymm4-7 = (bits >> 23) - 127 as integers
        vpsubd  ymm4, ymm4, ymmword ptr exponent_shft
        vpsubd  ymm5, ymm5, ymmword ptr exponent_shft
        vpsubd  ymm6, ymm6, ymmword ptr exponent_shft
        vpsubd  ymm7, ymm7, ymmword ptr exponent_shft

        ; ymm8-11 = m, accumulator for the high-order terms (c3..c5)
        vmovaps ymm8 , ymm0
        vmovaps ymm9 , ymm1
        vmovaps ymm10, ymm2
        vmovaps ymm11, ymm3

        ; ymm4-7 = e as float
        vcvtdq2ps ymm4, ymm4
        vcvtdq2ps ymm5, ymm5
        vcvtdq2ps ymm6, ymm6
        vcvtdq2ps ymm7, ymm7

        ; ymm12-15 = m, accumulator for the low-order terms (c0..c2)
        vmovaps ymm12, ymm0
        vmovaps ymm13, ymm1
        vmovaps ymm14, ymm2
        vmovaps ymm15, ymm3

        ; hi = c5*m ; lo = c2*m
        vmulps  ymm8 , ymm8 , ymmword ptr log2_c5
        vmulps  ymm9 , ymm9 , ymmword ptr log2_c5
        vmulps  ymm10, ymm10, ymmword ptr log2_c5
        vmulps  ymm11, ymm11, ymmword ptr log2_c5
        vmulps  ymm12, ymm12, ymmword ptr log2_c2
        vmulps  ymm13, ymm13, ymmword ptr log2_c2
        vmulps  ymm14, ymm14, ymmword ptr log2_c2
        vmulps  ymm15, ymm15, ymmword ptr log2_c2

        ; hi = c5*m + c4 ; lo = c2*m + c1
        vaddps  ymm8 , ymm8 , ymmword ptr log2_c4
        vaddps  ymm9 , ymm9 , ymmword ptr log2_c4
        vaddps  ymm10, ymm10, ymmword ptr log2_c4
        vaddps  ymm11, ymm11, ymmword ptr log2_c4
        vaddps  ymm12, ymm12, ymmword ptr log2_c1
        vaddps  ymm13, ymm13, ymmword ptr log2_c1
        vaddps  ymm14, ymm14, ymmword ptr log2_c1
        vaddps  ymm15, ymm15, ymmword ptr log2_c1

        ; hi *= m ; lo *= m
        vmulps  ymm8 , ymm8 , ymm0
        vmulps  ymm9 , ymm9 , ymm1
        vmulps  ymm10, ymm10, ymm2
        vmulps  ymm11, ymm11, ymm3
        vmulps  ymm12, ymm12, ymm0
        vmulps  ymm13, ymm13, ymm1
        vmulps  ymm14, ymm14, ymm2
        vmulps  ymm15, ymm15, ymm3

        ; hi = (c5*m + c4)*m + c3 ; lo = (c2*m + c1)*m + c0
        vaddps  ymm8 , ymm8 , ymmword ptr log2_c3
        vaddps  ymm9 , ymm9 , ymmword ptr log2_c3
        vaddps  ymm10, ymm10, ymmword ptr log2_c3
        vaddps  ymm11, ymm11, ymmword ptr log2_c3
        vaddps  ymm12, ymm12, ymmword ptr log2_c0
        vaddps  ymm13, ymm13, ymmword ptr log2_c0
        vaddps  ymm14, ymm14, ymmword ptr log2_c0
        vaddps  ymm15, ymm15, ymmword ptr log2_c0

        ; hi *= m^3, done as three successive multiplies by m
        vmulps  ymm8 , ymm8 , ymm0
        vmulps  ymm9 , ymm9 , ymm1
        vmulps  ymm10, ymm10, ymm2
        vmulps  ymm11, ymm11, ymm3
        vmulps  ymm8 , ymm8 , ymm0
        vmulps  ymm9 , ymm9 , ymm1
        vmulps  ymm10, ymm10, ymm2
        vmulps  ymm11, ymm11, ymm3
        vmulps  ymm8 , ymm8 , ymm0
        vmulps  ymm9 , ymm9 , ymm1
        vmulps  ymm10, ymm10, ymm2
        vmulps  ymm11, ymm11, ymm3

        ; ymm0-3 = m - 1
        vsubps  ymm0, ymm0, ymmword ptr cval_one
        vsubps  ymm1, ymm1, ymmword ptr cval_one
        vsubps  ymm2, ymm2, ymmword ptr cval_one
        vsubps  ymm3, ymm3, ymmword ptr cval_one

        ; ymm8-11 = full polynomial = hi + lo
        vaddps  ymm8 , ymm8 , ymm12
        vaddps  ymm9 , ymm9 , ymm13
        vaddps  ymm10, ymm10, ymm14
        vaddps  ymm11, ymm11, ymm15

        ; ymm0-3 = (m - 1) * polynomial
        vmulps  ymm0, ymm0, ymm8
        vmulps  ymm1, ymm1, ymm9
        vmulps  ymm2, ymm2, ymm10
        vmulps  ymm3, ymm3, ymm11

        ; ymm0-3 += e
        vaddps  ymm0, ymm0, ymm4
        vaddps  ymm1, ymm1, ymm5
        vaddps  ymm2, ymm2, ymm6
        vaddps  ymm3, ymm3, ymm7

        ; write the results back over the input
        vmovaps ymmword ptr[rcx + 0], ymm0
        vmovaps ymmword ptr[rcx + 32], ymm1
        vmovaps ymmword ptr[rcx + 64], ymm2
        vmovaps ymmword ptr[rcx + 96], ymm3

        ; Clear the upper YMM halves so callers running legacy SSE code
        ; do not pay an AVX/SSE transition penalty.
        vzeroupper

        ; Restore the non-volatile XMM registers (VEX-encoded 128-bit
        ; loads leave the upper YMM halves zero).
        vmovaps xmm6 , xmmword ptr [rsp +   0]
        vmovaps xmm7 , xmmword ptr [rsp +  16]
        vmovaps xmm8 , xmmword ptr [rsp +  32]
        vmovaps xmm9 , xmmword ptr [rsp +  48]
        vmovaps xmm10, xmmword ptr [rsp +  64]
        vmovaps xmm11, xmmword ptr [rsp +  80]
        vmovaps xmm12, xmmword ptr [rsp +  96]
        vmovaps xmm13, xmmword ptr [rsp + 112]
        vmovaps xmm14, xmmword ptr [rsp + 128]
        vmovaps xmm15, xmmword ptr [rsp + 144]
        add     rsp, 168
        ret     0
_mm256_log2_ps_packet32 endp
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
.data
; Constants for _mm_log2_ps_packet12 / _mm_log2_ps_packet16.  Each value
; is replicated four times to fill one XMM register.
;
; They are used directly as xmmword memory operands of legacy SSE
; instructions (andps, orps, psubd, mulps, addps, subps), which fault on
; addresses that are not 16-byte aligned.  Every table is exactly 16
; bytes, so aligning the first one keeps all of them aligned.
        align   16

; Bit-field helpers used to split a float into mantissa and exponent.
cval_one        DD 4 DUP (03f800000r)   ; 1.0f
mantissa_mask   DD 4 DUP (07fffffH)     ; low 23 bits of each dword
exponent_shft   DD 4 DUP (000007fH)     ; 127, subtracted from (bits >> 23)

; Polynomial coefficients log2_cK multiply m^K in the routines below.
log2_c5         DD 4 DUP (0bd0d0cc5r)   ; -0.034436
log2_c2         DD 4 DUP (04026537br)   ; 2.59885
log2_c4         DD 4 DUP (03ea2ecddr)   ; 0.318213
log2_c1         DD 4 DUP (0c054bfadr)   ; -3.3242
log2_c3         DD 4 DUP (0bf9da2c9r)   ; -1.23153
log2_c0         DD 4 DUP (04047691ar)   ; 3.11579
.code | |
;-----------------------------------------------------------------------
; _mm_log2_ps_packet12
;   Replaces 12 packed single-precision floats with an approximation of
;   their base-2 logarithm, in place.
;   C-equivalent (presumed): void _mm_log2_ps_packet12(float *data);
;
; ABI:   Microsoft x64
; In:    rcx = address of 12 floats; must be 16-byte aligned because the
;              data is loaded and stored with movaps
; Out:   [rcx .. rcx+47] overwritten with the results; rax is not set
; Clobb: xmm0-xmm5.  xmm6-xmm14 are non-volatile in this ABI and are
;        saved/restored here.
; Stack: 152 bytes (9 x 16-byte XMM save slots + 8 bytes so that rsp is
;        16-byte aligned for the movaps saves)
;
; Method, per lane, with bits = raw IEEE-754 pattern of x:
;   m = (bits AND 007fffffh) OR 1.0f
;   e = float((bits >> 23) - 127)
;   result = (m - 1) * ((c0 + c1*m + c2*m^2) + (c3 + c4*m + c5*m^2)*m^3) + e
; The shift is logical and the sign bit is not masked, and there is no
; special-casing of 0, denormals, infinities or NaN, so the result is
; only meaningful for positive normal inputs.
; Three XMM registers are processed in lock-step; the instruction order
; of the arithmetic is deliberately left unchanged.
;-----------------------------------------------------------------------
        align   16                      ; align the entry point itself
_mm_log2_ps_packet12 proc frame
        sub     rsp, 152
        .allocstack 152
        movaps  xmmword ptr [rsp +   0], xmm6
        .savexmm128 xmm6, 0
        movaps  xmmword ptr [rsp +  16], xmm7
        .savexmm128 xmm7, 16
        movaps  xmmword ptr [rsp +  32], xmm8
        .savexmm128 xmm8, 32
        movaps  xmmword ptr [rsp +  48], xmm9
        .savexmm128 xmm9, 48
        movaps  xmmword ptr [rsp +  64], xmm10
        .savexmm128 xmm10, 64
        movaps  xmmword ptr [rsp +  80], xmm11
        .savexmm128 xmm11, 80
        movaps  xmmword ptr [rsp +  96], xmm12
        .savexmm128 xmm12, 96
        movaps  xmmword ptr [rsp + 112], xmm13
        .savexmm128 xmm13, 112
        movaps  xmmword ptr [rsp + 128], xmm14
        .savexmm128 xmm14, 128
        .endprolog

        ; xmm0-2 = input, xmm3-5 = second copy for exponent extraction
        movaps  xmm0, xmmword ptr[rcx + 0]
        movaps  xmm1, xmmword ptr[rcx + 16]
        movaps  xmm2, xmmword ptr[rcx + 32]
        movaps  xmm3, xmm0
        movaps  xmm4, xmm1
        movaps  xmm5, xmm2

        ; xmm0-2 = m = mantissa bits with the exponent of 1.0f
        andps   xmm0, xmmword ptr mantissa_mask
        andps   xmm1, xmmword ptr mantissa_mask
        andps   xmm2, xmmword ptr mantissa_mask
        orps    xmm0, xmmword ptr cval_one
        orps    xmm1, xmmword ptr cval_one
        orps    xmm2, xmmword ptr cval_one

        ; xmm3-5 = e = float((bits >> 23) - 127); shift is logical
        psrld   xmm3, 23
        psrld   xmm4, 23
        psrld   xmm5, 23
        psubd   xmm3, xmmword ptr exponent_shft
        psubd   xmm4, xmmword ptr exponent_shft
        psubd   xmm5, xmmword ptr exponent_shft
        cvtdq2ps xmm3, xmm3
        cvtdq2ps xmm4, xmm4
        cvtdq2ps xmm5, xmm5

        ; xmm6-8 = m^3, computed as (m*m)*m
        movaps  xmm6, xmm0
        movaps  xmm7, xmm1
        movaps  xmm8, xmm2
        mulps   xmm6, xmm6
        mulps   xmm7, xmm7
        mulps   xmm8, xmm8
        mulps   xmm6, xmm0
        mulps   xmm7, xmm1
        mulps   xmm8, xmm2

        ; xmm9-11 = m (low-order accumulator), xmm12-14 = m (kept intact)
        movaps  xmm9 , xmm0
        movaps  xmm10, xmm1
        movaps  xmm11, xmm2
        movaps  xmm12, xmm0
        movaps  xmm13, xmm1
        movaps  xmm14, xmm2

        ; hi (xmm0-2) = c5*m ; lo (xmm9-11) = c2*m
        mulps   xmm0, xmmword ptr log2_c5
        mulps   xmm1, xmmword ptr log2_c5
        mulps   xmm2, xmmword ptr log2_c5
        mulps   xmm9 , xmmword ptr log2_c2
        mulps   xmm10, xmmword ptr log2_c2
        mulps   xmm11, xmmword ptr log2_c2

        ; hi = c5*m + c4 ; lo = c2*m + c1
        addps   xmm0, xmmword ptr log2_c4
        addps   xmm1, xmmword ptr log2_c4
        addps   xmm2, xmmword ptr log2_c4
        addps   xmm9, xmmword ptr log2_c1
        addps   xmm10, xmmword ptr log2_c1
        addps   xmm11, xmmword ptr log2_c1

        ; hi *= m ; lo *= m
        mulps   xmm0, xmm12
        mulps   xmm1, xmm13
        mulps   xmm2, xmm14
        mulps   xmm9, xmm12
        mulps   xmm10, xmm13
        mulps   xmm11, xmm14

        ; hi = (c5*m + c4)*m + c3 ; lo = (c2*m + c1)*m + c0
        addps   xmm0, xmmword ptr log2_c3
        addps   xmm1, xmmword ptr log2_c3
        addps   xmm2, xmmword ptr log2_c3
        addps   xmm9, xmmword ptr log2_c0
        addps   xmm10, xmmword ptr log2_c0
        addps   xmm11, xmmword ptr log2_c0

        ; hi *= m^3
        mulps   xmm0, xmm6
        mulps   xmm1, xmm7
        mulps   xmm2, xmm8

        ; xmm12-14 = m - 1
        subps   xmm12, xmmword ptr cval_one
        subps   xmm13, xmmword ptr cval_one
        subps   xmm14, xmmword ptr cval_one

        ; xmm0-2 = full polynomial = hi + lo
        addps   xmm0, xmm9
        addps   xmm1, xmm10
        addps   xmm2, xmm11

        ; xmm12-14 = (m - 1) * polynomial + e
        mulps   xmm12, xmm0
        mulps   xmm13, xmm1
        mulps   xmm14, xmm2
        addps   xmm12, xmm3
        addps   xmm13, xmm4
        addps   xmm14, xmm5

        ; write the results back over the input
        movaps  xmmword ptr[rcx + 0], xmm12
        movaps  xmmword ptr[rcx + 16], xmm13
        movaps  xmmword ptr[rcx + 32], xmm14

        ; restore the non-volatile XMM registers
        movaps  xmm6 , xmmword ptr [rsp +   0]
        movaps  xmm7 , xmmword ptr [rsp +  16]
        movaps  xmm8 , xmmword ptr [rsp +  32]
        movaps  xmm9 , xmmword ptr [rsp +  48]
        movaps  xmm10, xmmword ptr [rsp +  64]
        movaps  xmm11, xmmword ptr [rsp +  80]
        movaps  xmm12, xmmword ptr [rsp +  96]
        movaps  xmm13, xmmword ptr [rsp + 112]
        movaps  xmm14, xmmword ptr [rsp + 128]
        add     rsp, 152
        ret     0
_mm_log2_ps_packet12 endp
;-----------------------------------------------------------------------
; _mm_log2_ps_packet16
;   Replaces 16 packed single-precision floats with an approximation of
;   their base-2 logarithm, in place.
;   C-equivalent (presumed): void _mm_log2_ps_packet16(float *data);
;
; ABI:   Microsoft x64
; In:    rcx = address of 16 floats; must be 16-byte aligned because the
;              data is loaded and stored with movaps
; Out:   [rcx .. rcx+63] overwritten with the results; rax is not set
; Clobb: xmm0-xmm5.  xmm6-xmm15 are non-volatile in this ABI and are
;        saved/restored here.
; Stack: 168 bytes (10 x 16-byte XMM save slots + 8 bytes so that rsp is
;        16-byte aligned for the movaps saves)
;
; Method, per lane, with bits = raw IEEE-754 pattern of x:
;   m = (bits AND 007fffffh) OR 1.0f
;   e = float((bits >> 23) - 127)
;   result = (m - 1) * (c0 + c1*m + c2*m^2 + c3*m^3 + c4*m^4 + c5*m^5) + e
; The shift is logical and the sign bit is not masked, and there is no
; special-casing of 0, denormals, infinities or NaN, so the result is
; only meaningful for positive normal inputs.
; Four XMM registers are processed in lock-step; the instruction order
; of the arithmetic is deliberately left unchanged.
;-----------------------------------------------------------------------
        align   16                      ; align the entry point itself
_mm_log2_ps_packet16 proc frame
        sub     rsp, 168
        .allocstack 168
        movaps  xmmword ptr [rsp +   0], xmm6
        .savexmm128 xmm6, 0
        movaps  xmmword ptr [rsp +  16], xmm7
        .savexmm128 xmm7, 16
        movaps  xmmword ptr [rsp +  32], xmm8
        .savexmm128 xmm8, 32
        movaps  xmmword ptr [rsp +  48], xmm9
        .savexmm128 xmm9, 48
        movaps  xmmword ptr [rsp +  64], xmm10
        .savexmm128 xmm10, 64
        movaps  xmmword ptr [rsp +  80], xmm11
        .savexmm128 xmm11, 80
        movaps  xmmword ptr [rsp +  96], xmm12
        .savexmm128 xmm12, 96
        movaps  xmmword ptr [rsp + 112], xmm13
        .savexmm128 xmm13, 112
        movaps  xmmword ptr [rsp + 128], xmm14
        .savexmm128 xmm14, 128
        movaps  xmmword ptr [rsp + 144], xmm15
        .savexmm128 xmm15, 144
        .endprolog

        ; xmm0-3 = input, xmm4-7 = second copy for exponent extraction
        movaps  xmm0, xmmword ptr[rcx + 0]
        movaps  xmm1, xmmword ptr[rcx + 16]
        movaps  xmm2, xmmword ptr[rcx + 32]
        movaps  xmm3, xmmword ptr[rcx + 48]
        movaps  xmm4, xmm0
        movaps  xmm5, xmm1
        movaps  xmm6, xmm2
        movaps  xmm7, xmm3

        ; xmm0-3 = mantissa bits only
        andps   xmm0, xmmword ptr mantissa_mask
        andps   xmm1, xmmword ptr mantissa_mask
        andps   xmm2, xmmword ptr mantissa_mask
        andps   xmm3, xmmword ptr mantissa_mask

        ; xmm4-7 = bits >> 23 (logical; sign bit ends up in bit 8)
        psrld   xmm4, 23
        psrld   xmm5, 23
        psrld   xmm6, 23
        psrld   xmm7, 23

        ; xmm0-3 = m = mantissa bits with the exponent of 1.0f
        orps    xmm0, xmmword ptr cval_one
        orps    xmm1, xmmword ptr cval_one
        orps    xmm2, xmmword ptr cval_one
        orps    xmm3, xmmword ptr cval_one

        ; xmm4-7 = e = float((bits >> 23) - 127)
        psubd   xmm4, xmmword ptr exponent_shft
        psubd   xmm5, xmmword ptr exponent_shft
        psubd   xmm6, xmmword ptr exponent_shft
        psubd   xmm7, xmmword ptr exponent_shft
        cvtdq2ps xmm4, xmm4
        cvtdq2ps xmm5, xmm5
        cvtdq2ps xmm6, xmm6
        cvtdq2ps xmm7, xmm7

        ; xmm8-11 = m (high-order accumulator, c3..c5)
        movaps  xmm8 , xmm0
        movaps  xmm9 , xmm1
        movaps  xmm10, xmm2
        movaps  xmm11, xmm3

        ; xmm12-15 = m (low-order accumulator, c0..c2)
        movaps  xmm12, xmm0
        movaps  xmm13, xmm1
        movaps  xmm14, xmm2
        movaps  xmm15, xmm3

        ; hi = c5*m ; lo = c2*m
        mulps   xmm8 , xmmword ptr log2_c5
        mulps   xmm9 , xmmword ptr log2_c5
        mulps   xmm10, xmmword ptr log2_c5
        mulps   xmm11, xmmword ptr log2_c5
        mulps   xmm12, xmmword ptr log2_c2
        mulps   xmm13, xmmword ptr log2_c2
        mulps   xmm14, xmmword ptr log2_c2
        mulps   xmm15, xmmword ptr log2_c2

        ; hi = c5*m + c4 ; lo = c2*m + c1
        addps   xmm8 , xmmword ptr log2_c4
        addps   xmm9 , xmmword ptr log2_c4
        addps   xmm10, xmmword ptr log2_c4
        addps   xmm11, xmmword ptr log2_c4
        addps   xmm12, xmmword ptr log2_c1
        addps   xmm13, xmmword ptr log2_c1
        addps   xmm14, xmmword ptr log2_c1
        addps   xmm15, xmmword ptr log2_c1

        ; hi *= m ; lo *= m
        mulps   xmm8 , xmm0
        mulps   xmm9 , xmm1
        mulps   xmm10, xmm2
        mulps   xmm11, xmm3
        mulps   xmm12, xmm0
        mulps   xmm13, xmm1
        mulps   xmm14, xmm2
        mulps   xmm15, xmm3

        ; hi = (c5*m + c4)*m + c3 ; lo = (c2*m + c1)*m + c0
        addps   xmm8 , xmmword ptr log2_c3
        addps   xmm9 , xmmword ptr log2_c3
        addps   xmm10, xmmword ptr log2_c3
        addps   xmm11, xmmword ptr log2_c3
        addps   xmm12, xmmword ptr log2_c0
        addps   xmm13, xmmword ptr log2_c0
        addps   xmm14, xmmword ptr log2_c0
        addps   xmm15, xmmword ptr log2_c0

        ; hi *= m^3, done as three successive multiplies by m
        mulps   xmm8 , xmm0
        mulps   xmm9 , xmm1
        mulps   xmm10, xmm2
        mulps   xmm11, xmm3
        mulps   xmm8 , xmm0
        mulps   xmm9 , xmm1
        mulps   xmm10, xmm2
        mulps   xmm11, xmm3
        mulps   xmm8 , xmm0
        mulps   xmm9 , xmm1
        mulps   xmm10, xmm2
        mulps   xmm11, xmm3

        ; xmm0-3 = m - 1
        subps   xmm0, xmmword ptr cval_one
        subps   xmm1, xmmword ptr cval_one
        subps   xmm2, xmmword ptr cval_one
        subps   xmm3, xmmword ptr cval_one

        ; xmm8-11 = full polynomial = hi + lo
        addps   xmm8 , xmm12
        addps   xmm9 , xmm13
        addps   xmm10, xmm14
        addps   xmm11, xmm15

        ; xmm0-3 = (m - 1) * polynomial + e
        mulps   xmm0, xmm8
        mulps   xmm1, xmm9
        mulps   xmm2, xmm10
        mulps   xmm3, xmm11
        addps   xmm0, xmm4
        addps   xmm1, xmm5
        addps   xmm2, xmm6
        addps   xmm3, xmm7

        ; write the results back over the input
        movaps  xmmword ptr[rcx + 0], xmm0
        movaps  xmmword ptr[rcx + 16], xmm1
        movaps  xmmword ptr[rcx + 32], xmm2
        movaps  xmmword ptr[rcx + 48], xmm3

        ; restore the non-volatile XMM registers
        movaps  xmm6 , xmmword ptr [rsp +   0]
        movaps  xmm7 , xmmword ptr [rsp +  16]
        movaps  xmm8 , xmmword ptr [rsp +  32]
        movaps  xmm9 , xmmword ptr [rsp +  48]
        movaps  xmm10, xmmword ptr [rsp +  64]
        movaps  xmm11, xmmword ptr [rsp +  80]
        movaps  xmm12, xmmword ptr [rsp +  96]
        movaps  xmm13, xmmword ptr [rsp + 112]
        movaps  xmm14, xmmword ptr [rsp + 128]
        movaps  xmm15, xmmword ptr [rsp + 144]
        add     rsp, 168
        ret     0
_mm_log2_ps_packet16 endp
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment