Skip to content

Instantly share code, notes, and snippets.

@reinsteam
Last active August 29, 2015 14:21
Show Gist options
  • Save reinsteam/5be398c745e562ee3a69 to your computer and use it in GitHub Desktop.
Save reinsteam/5be398c745e562ee3a69 to your computer and use it in GitHub Desktop.
log2 computation (12, 16, or 32 floats at a time) using the SSE2/AVX2 instruction sets, based on a minimax polynomial approximation (coefficients found with the Remez exchange algorithm)
.data
; Broadcast constants for the 256-bit routine: every row holds eight
; identical 32-bit lanes (32 bytes), so a single ymmword memory operand
; supplies the same value to all lanes.
; Values ending in 'r' are raw IEEE-754 single-precision bit patterns.
cval_one      DD 03f800000r, 03f800000r, 03f800000r, 03f800000r, 03f800000r, 03f800000r, 03f800000r, 03f800000r ; 1
mantissa_mask DD 07fffffH, 07fffffH, 07fffffH, 07fffffH, 07fffffH, 07fffffH, 07fffffH, 07fffffH ; low 23 bits
exponent_shft DD 07fH, 07fH, 07fH, 07fH, 07fH, 07fH, 07fH, 07fH ; 127
log2_c5       DD 0bd0d0cc5r, 0bd0d0cc5r, 0bd0d0cc5r, 0bd0d0cc5r, 0bd0d0cc5r, 0bd0d0cc5r, 0bd0d0cc5r, 0bd0d0cc5r ; -0.034436
log2_c2       DD 04026537br, 04026537br, 04026537br, 04026537br, 04026537br, 04026537br, 04026537br, 04026537br ; 2.59885
log2_c4       DD 03ea2ecddr, 03ea2ecddr, 03ea2ecddr, 03ea2ecddr, 03ea2ecddr, 03ea2ecddr, 03ea2ecddr, 03ea2ecddr ; 0.318213
log2_c1       DD 0c054bfadr, 0c054bfadr, 0c054bfadr, 0c054bfadr, 0c054bfadr, 0c054bfadr, 0c054bfadr, 0c054bfadr ; -3.3242
log2_c3       DD 0bf9da2c9r, 0bf9da2c9r, 0bf9da2c9r, 0bf9da2c9r, 0bf9da2c9r, 0bf9da2c9r, 0bf9da2c9r, 0bf9da2c9r ; -1.23153
log2_c0       DD 04047691ar, 04047691ar, 04047691ar, 04047691ar, 04047691ar, 04047691ar, 04047691ar, 04047691ar ; 3.11579
.code
;-----------------------------------------------------------------------
; _mm256_log2_ps_packet32
;   Approximates log2 of 32 packed single-precision floats, in place.
;   Presumed C signature: void _mm256_log2_ps_packet32(float *data)
;   -- TODO confirm against the caller.
;
; ABI:    Microsoft x64 (first argument in rcx), requires AVX2
; In:     rcx = buffer of 32 floats (128 bytes); must be 32-byte aligned
;               because it is accessed with vmovaps
; Out:    the 128 bytes at [rcx] are overwritten with the results
; Clobb:  ymm0-ymm5; upper halves of every ymm register (vzeroupper)
; Stack:  168 bytes (save area for nonvolatile xmm6-xmm15 + 8 to align)
;
; Method, per lane:
;   e = float((bits >> 23) - 127)
;   m = (bits & 007fffffh) | 1.0
;   result = e + (m - 1) * (((c5*m + c4)*m + c3)*m*m*m + (c2*m + c1)*m + c0)
;
; No special handling for zero, denormal, negative, infinite or NaN
; inputs is performed; the sign bit is not masked out of the exponent.
;-----------------------------------------------------------------------
align 16
_mm256_log2_ps_packet32 proc frame
        ; xmm6-xmm15 are callee-saved in the Microsoft x64 ABI (only the
        ; low 128 bits); spill them and describe the spill to the unwinder.
        sub         rsp, 168
        .allocstack 168
        vmovaps     xmmword ptr [rsp +   0], xmm6
        .savexmm128 xmm6, 0
        vmovaps     xmmword ptr [rsp +  16], xmm7
        .savexmm128 xmm7, 16
        vmovaps     xmmword ptr [rsp +  32], xmm8
        .savexmm128 xmm8, 32
        vmovaps     xmmword ptr [rsp +  48], xmm9
        .savexmm128 xmm9, 48
        vmovaps     xmmword ptr [rsp +  64], xmm10
        .savexmm128 xmm10, 64
        vmovaps     xmmword ptr [rsp +  80], xmm11
        .savexmm128 xmm11, 80
        vmovaps     xmmword ptr [rsp +  96], xmm12
        .savexmm128 xmm12, 96
        vmovaps     xmmword ptr [rsp + 112], xmm13
        .savexmm128 xmm13, 112
        vmovaps     xmmword ptr [rsp + 128], xmm14
        .savexmm128 xmm14, 128
        vmovaps     xmmword ptr [rsp + 144], xmm15
        .savexmm128 xmm15, 144
        .endprolog

        ; ymm0-3 = raw input bits, four groups of eight floats
        vmovaps     ymm0, ymmword ptr [rcx +  0]
        vmovaps     ymm1, ymmword ptr [rcx + 32]
        vmovaps     ymm2, ymmword ptr [rcx + 64]
        vmovaps     ymm3, ymmword ptr [rcx + 96]

        ; ymm4-7 = e = float((bits >> 23) - 127)
        ; (taken from the raw bits before ymm0-3 are reduced to the mantissa)
        vpsrld      ymm4, ymm0, 23
        vpsrld      ymm5, ymm1, 23
        vpsrld      ymm6, ymm2, 23
        vpsrld      ymm7, ymm3, 23
        vpsubd      ymm4, ymm4, ymmword ptr exponent_shft
        vpsubd      ymm5, ymm5, ymmword ptr exponent_shft
        vpsubd      ymm6, ymm6, ymmword ptr exponent_shft
        vpsubd      ymm7, ymm7, ymmword ptr exponent_shft
        vcvtdq2ps   ymm4, ymm4
        vcvtdq2ps   ymm5, ymm5
        vcvtdq2ps   ymm6, ymm6
        vcvtdq2ps   ymm7, ymm7

        ; ymm0-3 = m = mantissa bits with the exponent field of 1.0
        vandps      ymm0, ymm0, ymmword ptr mantissa_mask
        vandps      ymm1, ymm1, ymmword ptr mantissa_mask
        vandps      ymm2, ymm2, ymmword ptr mantissa_mask
        vandps      ymm3, ymm3, ymmword ptr mantissa_mask
        vorps       ymm0, ymm0, ymmword ptr cval_one
        vorps       ymm1, ymm1, ymmword ptr cval_one
        vorps       ymm2, ymm2, ymmword ptr cval_one
        vorps       ymm3, ymm3, ymmword ptr cval_one

        ; ymm8-11  = hi = m * c5        ymm12-15 = lo = m * c2
        vmulps      ymm8 , ymm0, ymmword ptr log2_c5
        vmulps      ymm9 , ymm1, ymmword ptr log2_c5
        vmulps      ymm10, ymm2, ymmword ptr log2_c5
        vmulps      ymm11, ymm3, ymmword ptr log2_c5
        vmulps      ymm12, ymm0, ymmword ptr log2_c2
        vmulps      ymm13, ymm1, ymmword ptr log2_c2
        vmulps      ymm14, ymm2, ymmword ptr log2_c2
        vmulps      ymm15, ymm3, ymmword ptr log2_c2

        ; hi += c4                      lo += c1
        vaddps      ymm8 , ymm8 , ymmword ptr log2_c4
        vaddps      ymm9 , ymm9 , ymmword ptr log2_c4
        vaddps      ymm10, ymm10, ymmword ptr log2_c4
        vaddps      ymm11, ymm11, ymmword ptr log2_c4
        vaddps      ymm12, ymm12, ymmword ptr log2_c1
        vaddps      ymm13, ymm13, ymmword ptr log2_c1
        vaddps      ymm14, ymm14, ymmword ptr log2_c1
        vaddps      ymm15, ymm15, ymmword ptr log2_c1

        ; hi *= m                       lo *= m
        vmulps      ymm8 , ymm8 , ymm0
        vmulps      ymm9 , ymm9 , ymm1
        vmulps      ymm10, ymm10, ymm2
        vmulps      ymm11, ymm11, ymm3
        vmulps      ymm12, ymm12, ymm0
        vmulps      ymm13, ymm13, ymm1
        vmulps      ymm14, ymm14, ymm2
        vmulps      ymm15, ymm15, ymm3

        ; hi += c3                      lo += c0
        vaddps      ymm8 , ymm8 , ymmword ptr log2_c3
        vaddps      ymm9 , ymm9 , ymmword ptr log2_c3
        vaddps      ymm10, ymm10, ymmword ptr log2_c3
        vaddps      ymm11, ymm11, ymmword ptr log2_c3
        vaddps      ymm12, ymm12, ymmword ptr log2_c0
        vaddps      ymm13, ymm13, ymmword ptr log2_c0
        vaddps      ymm14, ymm14, ymmword ptr log2_c0
        vaddps      ymm15, ymm15, ymmword ptr log2_c0

        ; hi = hi * m * m * m (three successive multiplies)
        vmulps      ymm8 , ymm8 , ymm0
        vmulps      ymm9 , ymm9 , ymm1
        vmulps      ymm10, ymm10, ymm2
        vmulps      ymm11, ymm11, ymm3
        vmulps      ymm8 , ymm8 , ymm0
        vmulps      ymm9 , ymm9 , ymm1
        vmulps      ymm10, ymm10, ymm2
        vmulps      ymm11, ymm11, ymm3
        vmulps      ymm8 , ymm8 , ymm0
        vmulps      ymm9 , ymm9 , ymm1
        vmulps      ymm10, ymm10, ymm2
        vmulps      ymm11, ymm11, ymm3

        ; ymm0-3 = m - 1
        vsubps      ymm0, ymm0, ymmword ptr cval_one
        vsubps      ymm1, ymm1, ymmword ptr cval_one
        vsubps      ymm2, ymm2, ymmword ptr cval_one
        vsubps      ymm3, ymm3, ymmword ptr cval_one

        ; ymm8-11 = hi + lo
        vaddps      ymm8 , ymm8 , ymm12
        vaddps      ymm9 , ymm9 , ymm13
        vaddps      ymm10, ymm10, ymm14
        vaddps      ymm11, ymm11, ymm15

        ; result = (m - 1) * (hi + lo) + e
        vmulps      ymm0, ymm0, ymm8
        vmulps      ymm1, ymm1, ymm9
        vmulps      ymm2, ymm2, ymm10
        vmulps      ymm3, ymm3, ymm11
        vaddps      ymm0, ymm0, ymm4
        vaddps      ymm1, ymm1, ymm5
        vaddps      ymm2, ymm2, ymm6
        vaddps      ymm3, ymm3, ymm7

        ; write the results back over the input
        vmovaps     ymmword ptr [rcx +  0], ymm0
        vmovaps     ymmword ptr [rcx + 32], ymm1
        vmovaps     ymmword ptr [rcx + 64], ymm2
        vmovaps     ymmword ptr [rcx + 96], ymm3

        ; restore the caller's nonvolatile registers
        vmovaps     xmm6 , xmmword ptr [rsp +   0]
        vmovaps     xmm7 , xmmword ptr [rsp +  16]
        vmovaps     xmm8 , xmmword ptr [rsp +  32]
        vmovaps     xmm9 , xmmword ptr [rsp +  48]
        vmovaps     xmm10, xmmword ptr [rsp +  64]
        vmovaps     xmm11, xmmword ptr [rsp +  80]
        vmovaps     xmm12, xmmword ptr [rsp +  96]
        vmovaps     xmm13, xmmword ptr [rsp + 112]
        vmovaps     xmm14, xmmword ptr [rsp + 128]
        vmovaps     xmm15, xmmword ptr [rsp + 144]

        ; Clear the upper ymm halves so legacy-SSE code in the caller does
        ; not pay an AVX/SSE transition penalty. Kept ahead of the
        ; 'add rsp / ret' pair so the epilogue keeps its canonical form.
        vzeroupper
        add         rsp, 168
        ret
_mm256_log2_ps_packet32 endp
end
.data
; Broadcast constants for the 128-bit routines: every row holds four
; identical 32-bit lanes (16 bytes).
; They are read through legacy-SSE xmmword memory operands, which fault
; on addresses that are not 16-byte aligned, so the alignment is made
; explicit here. Each row is exactly 16 bytes, so aligning the first row
; aligns all of them.
; Values ending in 'r' are raw IEEE-754 single-precision bit patterns.
align 16
cval_one      DD 03f800000r, 03f800000r, 03f800000r, 03f800000r ; 1
mantissa_mask DD 07fffffH, 07fffffH, 07fffffH, 07fffffH         ; low 23 bits
exponent_shft DD 000007fH, 000007fH, 000007fH, 000007fH         ; 127
log2_c5       DD 0bd0d0cc5r, 0bd0d0cc5r, 0bd0d0cc5r, 0bd0d0cc5r ; -0.034436
log2_c2       DD 04026537br, 04026537br, 04026537br, 04026537br ; 2.59885
log2_c4       DD 03ea2ecddr, 03ea2ecddr, 03ea2ecddr, 03ea2ecddr ; 0.318213
log2_c1       DD 0c054bfadr, 0c054bfadr, 0c054bfadr, 0c054bfadr ; -3.3242
log2_c3       DD 0bf9da2c9r, 0bf9da2c9r, 0bf9da2c9r, 0bf9da2c9r ; -1.23153
log2_c0       DD 04047691ar, 04047691ar, 04047691ar, 04047691ar ; 3.11579
.code
;-----------------------------------------------------------------------
; _mm_log2_ps_packet12
;   Approximates log2 of 12 packed single-precision floats, in place.
;   Presumed C signature: void _mm_log2_ps_packet12(float *data)
;   -- TODO confirm against the caller.
;
; ABI:    Microsoft x64 (first argument in rcx), requires SSE2
; In:     rcx = buffer of 12 floats (48 bytes); must be 16-byte aligned
;               because it is accessed with movaps
; Out:    the 48 bytes at [rcx] are overwritten with the results
; Clobb:  xmm0-xmm5
; Stack:  152 bytes (save area for nonvolatile xmm6-xmm14 + 8 to align)
;
; Method, per lane:
;   e = float((bits >> 23) - 127)
;   m = (bits & 007fffffh) | 1.0
;   result = e + (m - 1) * (((c5*m + c4)*m + c3)*(m*m*m) + (c2*m + c1)*m + c0)
;
; No special handling for zero, denormal, negative, infinite or NaN
; inputs is performed; the sign bit is not masked out of the exponent.
;-----------------------------------------------------------------------
align 16
_mm_log2_ps_packet12 proc frame
        ; xmm6-xmm14 are callee-saved in the Microsoft x64 ABI; spill them
        ; and describe the spill to the unwinder.
        sub         rsp, 152
        .allocstack 152
        movaps      xmmword ptr [rsp +   0], xmm6
        .savexmm128 xmm6, 0
        movaps      xmmword ptr [rsp +  16], xmm7
        .savexmm128 xmm7, 16
        movaps      xmmword ptr [rsp +  32], xmm8
        .savexmm128 xmm8, 32
        movaps      xmmword ptr [rsp +  48], xmm9
        .savexmm128 xmm9, 48
        movaps      xmmword ptr [rsp +  64], xmm10
        .savexmm128 xmm10, 64
        movaps      xmmword ptr [rsp +  80], xmm11
        .savexmm128 xmm11, 80
        movaps      xmmword ptr [rsp +  96], xmm12
        .savexmm128 xmm12, 96
        movaps      xmmword ptr [rsp + 112], xmm13
        .savexmm128 xmm13, 112
        movaps      xmmword ptr [rsp + 128], xmm14
        .savexmm128 xmm14, 128
        .endprolog

        ; xmm0-2 = raw input bits; xmm3-5 = copies for exponent extraction
        movaps      xmm0, xmmword ptr [rcx +  0]
        movaps      xmm1, xmmword ptr [rcx + 16]
        movaps      xmm2, xmmword ptr [rcx + 32]
        movaps      xmm3, xmm0
        movaps      xmm4, xmm1
        movaps      xmm5, xmm2

        ; xmm0-2 = m = mantissa bits with the exponent field of 1.0
        andps       xmm0, xmmword ptr mantissa_mask
        andps       xmm1, xmmword ptr mantissa_mask
        andps       xmm2, xmmword ptr mantissa_mask
        orps        xmm0, xmmword ptr cval_one
        orps        xmm1, xmmword ptr cval_one
        orps        xmm2, xmmword ptr cval_one

        ; xmm3-5 = e = float((bits >> 23) - 127)
        psrld       xmm3, 23
        psrld       xmm4, 23
        psrld       xmm5, 23
        psubd       xmm3, xmmword ptr exponent_shft
        psubd       xmm4, xmmword ptr exponent_shft
        psubd       xmm5, xmmword ptr exponent_shft
        cvtdq2ps    xmm3, xmm3
        cvtdq2ps    xmm4, xmm4
        cvtdq2ps    xmm5, xmm5

        ; xmm6-8 = m^3, computed as (m * m) * m
        movaps      xmm6, xmm0
        movaps      xmm7, xmm1
        movaps      xmm8, xmm2
        mulps       xmm6, xmm6
        mulps       xmm7, xmm7
        mulps       xmm8, xmm8
        mulps       xmm6, xmm0
        mulps       xmm7, xmm1
        mulps       xmm8, xmm2

        ; xmm9-11 and xmm12-14 = further copies of m; from here on
        ; xmm0-2 hold the high half of the polynomial, xmm9-11 the low
        ; half, and xmm12-14 keep m itself.
        movaps      xmm9 , xmm0
        movaps      xmm10, xmm1
        movaps      xmm11, xmm2
        movaps      xmm12, xmm0
        movaps      xmm13, xmm1
        movaps      xmm14, xmm2

        ; hi = m * c5                   lo = m * c2
        mulps       xmm0 , xmmword ptr log2_c5
        mulps       xmm1 , xmmword ptr log2_c5
        mulps       xmm2 , xmmword ptr log2_c5
        mulps       xmm9 , xmmword ptr log2_c2
        mulps       xmm10, xmmword ptr log2_c2
        mulps       xmm11, xmmword ptr log2_c2

        ; hi += c4                      lo += c1
        addps       xmm0 , xmmword ptr log2_c4
        addps       xmm1 , xmmword ptr log2_c4
        addps       xmm2 , xmmword ptr log2_c4
        addps       xmm9 , xmmword ptr log2_c1
        addps       xmm10, xmmword ptr log2_c1
        addps       xmm11, xmmword ptr log2_c1

        ; hi *= m                       lo *= m
        mulps       xmm0 , xmm12
        mulps       xmm1 , xmm13
        mulps       xmm2 , xmm14
        mulps       xmm9 , xmm12
        mulps       xmm10, xmm13
        mulps       xmm11, xmm14

        ; hi += c3                      lo += c0
        addps       xmm0 , xmmword ptr log2_c3
        addps       xmm1 , xmmword ptr log2_c3
        addps       xmm2 , xmmword ptr log2_c3
        addps       xmm9 , xmmword ptr log2_c0
        addps       xmm10, xmmword ptr log2_c0
        addps       xmm11, xmmword ptr log2_c0

        ; hi *= m^3
        mulps       xmm0, xmm6
        mulps       xmm1, xmm7
        mulps       xmm2, xmm8

        ; xmm12-14 = m - 1
        subps       xmm12, xmmword ptr cval_one
        subps       xmm13, xmmword ptr cval_one
        subps       xmm14, xmmword ptr cval_one

        ; xmm0-2 = hi + lo
        addps       xmm0, xmm9
        addps       xmm1, xmm10
        addps       xmm2, xmm11

        ; result = (m - 1) * (hi + lo) + e
        mulps       xmm12, xmm0
        mulps       xmm13, xmm1
        mulps       xmm14, xmm2
        addps       xmm12, xmm3
        addps       xmm13, xmm4
        addps       xmm14, xmm5

        ; write the results back over the input
        movaps      xmmword ptr [rcx +  0], xmm12
        movaps      xmmword ptr [rcx + 16], xmm13
        movaps      xmmword ptr [rcx + 32], xmm14

        ; restore the caller's nonvolatile registers
        movaps      xmm6 , xmmword ptr [rsp +   0]
        movaps      xmm7 , xmmword ptr [rsp +  16]
        movaps      xmm8 , xmmword ptr [rsp +  32]
        movaps      xmm9 , xmmword ptr [rsp +  48]
        movaps      xmm10, xmmword ptr [rsp +  64]
        movaps      xmm11, xmmword ptr [rsp +  80]
        movaps      xmm12, xmmword ptr [rsp +  96]
        movaps      xmm13, xmmword ptr [rsp + 112]
        movaps      xmm14, xmmword ptr [rsp + 128]
        add         rsp, 152
        ret
_mm_log2_ps_packet12 endp
;-----------------------------------------------------------------------
; _mm_log2_ps_packet16
;   Approximates log2 of 16 packed single-precision floats, in place.
;   Presumed C signature: void _mm_log2_ps_packet16(float *data)
;   -- TODO confirm against the caller.
;
; ABI:    Microsoft x64 (first argument in rcx), requires SSE2
; In:     rcx = buffer of 16 floats (64 bytes); must be 16-byte aligned
;               because it is accessed with movaps
; Out:    the 64 bytes at [rcx] are overwritten with the results
; Clobb:  xmm0-xmm5
; Stack:  168 bytes (save area for nonvolatile xmm6-xmm15 + 8 to align)
;
; Method, per lane:
;   e = float((bits >> 23) - 127)
;   m = (bits & 007fffffh) | 1.0
;   result = e + (m - 1) * (((c5*m + c4)*m + c3)*m*m*m + (c2*m + c1)*m + c0)
;
; No special handling for zero, denormal, negative, infinite or NaN
; inputs is performed; the sign bit is not masked out of the exponent.
;-----------------------------------------------------------------------
align 16
_mm_log2_ps_packet16 proc frame
        ; xmm6-xmm15 are callee-saved in the Microsoft x64 ABI; spill them
        ; and describe the spill to the unwinder.
        sub         rsp, 168
        .allocstack 168
        movaps      xmmword ptr [rsp +   0], xmm6
        .savexmm128 xmm6, 0
        movaps      xmmword ptr [rsp +  16], xmm7
        .savexmm128 xmm7, 16
        movaps      xmmword ptr [rsp +  32], xmm8
        .savexmm128 xmm8, 32
        movaps      xmmword ptr [rsp +  48], xmm9
        .savexmm128 xmm9, 48
        movaps      xmmword ptr [rsp +  64], xmm10
        .savexmm128 xmm10, 64
        movaps      xmmword ptr [rsp +  80], xmm11
        .savexmm128 xmm11, 80
        movaps      xmmword ptr [rsp +  96], xmm12
        .savexmm128 xmm12, 96
        movaps      xmmword ptr [rsp + 112], xmm13
        .savexmm128 xmm13, 112
        movaps      xmmword ptr [rsp + 128], xmm14
        .savexmm128 xmm14, 128
        movaps      xmmword ptr [rsp + 144], xmm15
        .savexmm128 xmm15, 144
        .endprolog

        ; xmm0-3 = raw input bits; xmm4-7 = copies for exponent extraction
        movaps      xmm0, xmmword ptr [rcx +  0]
        movaps      xmm1, xmmword ptr [rcx + 16]
        movaps      xmm2, xmmword ptr [rcx + 32]
        movaps      xmm3, xmmword ptr [rcx + 48]
        movaps      xmm4, xmm0
        movaps      xmm5, xmm1
        movaps      xmm6, xmm2
        movaps      xmm7, xmm3

        ; xmm0-3 = m = mantissa bits with the exponent field of 1.0
        andps       xmm0, xmmword ptr mantissa_mask
        andps       xmm1, xmmword ptr mantissa_mask
        andps       xmm2, xmmword ptr mantissa_mask
        andps       xmm3, xmmword ptr mantissa_mask
        orps        xmm0, xmmword ptr cval_one
        orps        xmm1, xmmword ptr cval_one
        orps        xmm2, xmmword ptr cval_one
        orps        xmm3, xmmword ptr cval_one

        ; xmm4-7 = e = float((bits >> 23) - 127)
        psrld       xmm4, 23
        psrld       xmm5, 23
        psrld       xmm6, 23
        psrld       xmm7, 23
        psubd       xmm4, xmmword ptr exponent_shft
        psubd       xmm5, xmmword ptr exponent_shft
        psubd       xmm6, xmmword ptr exponent_shft
        psubd       xmm7, xmmword ptr exponent_shft
        cvtdq2ps    xmm4, xmm4
        cvtdq2ps    xmm5, xmm5
        cvtdq2ps    xmm6, xmm6
        cvtdq2ps    xmm7, xmm7

        ; xmm8-11 (hi) and xmm12-15 (lo) both start out as m
        movaps      xmm8 , xmm0
        movaps      xmm9 , xmm1
        movaps      xmm10, xmm2
        movaps      xmm11, xmm3
        movaps      xmm12, xmm0
        movaps      xmm13, xmm1
        movaps      xmm14, xmm2
        movaps      xmm15, xmm3

        ; hi = m * c5                   lo = m * c2
        mulps       xmm8 , xmmword ptr log2_c5
        mulps       xmm9 , xmmword ptr log2_c5
        mulps       xmm10, xmmword ptr log2_c5
        mulps       xmm11, xmmword ptr log2_c5
        mulps       xmm12, xmmword ptr log2_c2
        mulps       xmm13, xmmword ptr log2_c2
        mulps       xmm14, xmmword ptr log2_c2
        mulps       xmm15, xmmword ptr log2_c2

        ; hi += c4                      lo += c1
        addps       xmm8 , xmmword ptr log2_c4
        addps       xmm9 , xmmword ptr log2_c4
        addps       xmm10, xmmword ptr log2_c4
        addps       xmm11, xmmword ptr log2_c4
        addps       xmm12, xmmword ptr log2_c1
        addps       xmm13, xmmword ptr log2_c1
        addps       xmm14, xmmword ptr log2_c1
        addps       xmm15, xmmword ptr log2_c1

        ; hi *= m                       lo *= m
        mulps       xmm8 , xmm0
        mulps       xmm9 , xmm1
        mulps       xmm10, xmm2
        mulps       xmm11, xmm3
        mulps       xmm12, xmm0
        mulps       xmm13, xmm1
        mulps       xmm14, xmm2
        mulps       xmm15, xmm3

        ; hi += c3                      lo += c0
        addps       xmm8 , xmmword ptr log2_c3
        addps       xmm9 , xmmword ptr log2_c3
        addps       xmm10, xmmword ptr log2_c3
        addps       xmm11, xmmword ptr log2_c3
        addps       xmm12, xmmword ptr log2_c0
        addps       xmm13, xmmword ptr log2_c0
        addps       xmm14, xmmword ptr log2_c0
        addps       xmm15, xmmword ptr log2_c0

        ; hi = hi * m * m * m (three successive multiplies)
        mulps       xmm8 , xmm0
        mulps       xmm9 , xmm1
        mulps       xmm10, xmm2
        mulps       xmm11, xmm3
        mulps       xmm8 , xmm0
        mulps       xmm9 , xmm1
        mulps       xmm10, xmm2
        mulps       xmm11, xmm3
        mulps       xmm8 , xmm0
        mulps       xmm9 , xmm1
        mulps       xmm10, xmm2
        mulps       xmm11, xmm3

        ; xmm0-3 = m - 1
        subps       xmm0, xmmword ptr cval_one
        subps       xmm1, xmmword ptr cval_one
        subps       xmm2, xmmword ptr cval_one
        subps       xmm3, xmmword ptr cval_one

        ; xmm8-11 = hi + lo
        addps       xmm8 , xmm12
        addps       xmm9 , xmm13
        addps       xmm10, xmm14
        addps       xmm11, xmm15

        ; result = (m - 1) * (hi + lo) + e
        mulps       xmm0, xmm8
        mulps       xmm1, xmm9
        mulps       xmm2, xmm10
        mulps       xmm3, xmm11
        addps       xmm0, xmm4
        addps       xmm1, xmm5
        addps       xmm2, xmm6
        addps       xmm3, xmm7

        ; write the results back over the input
        movaps      xmmword ptr [rcx +  0], xmm0
        movaps      xmmword ptr [rcx + 16], xmm1
        movaps      xmmword ptr [rcx + 32], xmm2
        movaps      xmmword ptr [rcx + 48], xmm3

        ; restore the caller's nonvolatile registers
        movaps      xmm6 , xmmword ptr [rsp +   0]
        movaps      xmm7 , xmmword ptr [rsp +  16]
        movaps      xmm8 , xmmword ptr [rsp +  32]
        movaps      xmm9 , xmmword ptr [rsp +  48]
        movaps      xmm10, xmmword ptr [rsp +  64]
        movaps      xmm11, xmmword ptr [rsp +  80]
        movaps      xmm12, xmmword ptr [rsp +  96]
        movaps      xmm13, xmmword ptr [rsp + 112]
        movaps      xmm14, xmmword ptr [rsp + 128]
        movaps      xmm15, xmmword ptr [rsp + 144]
        add         rsp, 168
        ret
_mm_log2_ps_packet16 endp
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment