Skip to content

Instantly share code, notes, and snippets.

@yoffy
Last active October 6, 2017 00:27
Show Gist options
  • Select an option

  • Save yoffy/f6ec447b085c7cdd42751c2ddb2e8178 to your computer and use it in GitHub Desktop.

Select an option

Save yoffy/f6ec447b085c7cdd42751c2ddb2e8178 to your computer and use it in GitHub Desktop.
; clang++-4.0 -I/home/yoffy/src/libiqo/include -I/home/yoffy/src/libiqo/src/../include -Wall -Wextra -Wconversion -Wno-sign-conversion -std=c++98 -fno-exceptions -Wall -Wextra -Wconversion -Wno-sign-conversion -Ofast -march=core2 -msse4.1 -mtune=westmere -S -mllvm --x86-asm-syntax=intel /home/yoffy/src/libiqo/src/IQOAreaResizerImpl_SSE4_1.cpp
# Outer loop of the clang-generated kernel (Intel syntax, '#' comments).
# Each pass produces 16 output bytes: it loads 16 32-bit indices, runs an inner
# gather / multiply / accumulate loop r14d times, then rounds, converts and
# saturates the 16 float sums to bytes and stores them at [r10 + r15].
# Register roles as used in this excerpt:
#   r15    element offset: scaled by 4 for the index table at r8, used unscaled
#          as the byte offset of the store at r10; advanced by 16 per pass
#   r8     base of the 32-bit index table
#   rsi    base of the float table that the indices select from
#   rbx    float offset into the multiplier table whose base is reloaded from
#          [rsp - 48] (presumably the resize coefficients -- confirm in the C++ source)
#   r14d   inner-loop trip count
#   xmm8   per-iteration increment added to every index vector; it is set
#          outside this excerpt (presumably four dwords of 1 -- confirm)
.LBB5_2: # =>This Loop Header: Depth=1
# Child Loop BB5_4 Depth 2
# A trip count <= 0 skips the inner loop and takes the zeroed-accumulator path.
test r14d, r14d
jle .LBB5_6
# BB#3: # in Loop: Header=BB5_2 Depth=1
# Load the 16 starting indices as four vectors (xmm5, xmm6, xmm7, xmm0).
# The 'or' with 4/8/12 acts as an add only while the low four bits of r15 are
# clear; r15 advances in steps of 16, but its initial value is not visible here.
movdqu xmm5, xmmword ptr [r8 + 4*r15]
mov rbp, r15
or rbp, 4
movdqu xmm6, xmmword ptr [r8 + 4*rbp]
mov rbp, r15
or rbp, 8
movdqu xmm7, xmmword ptr [r8 + 4*rbp]
mov rbp, r15
or rbp, 12
movdqu xmm0, xmmword ptr [r8 + 4*rbp]
# r9 = multiplier cursor; the inner loop reads the 64 bytes at [r9 - 48 .. r9 + 15].
mov rax, qword ptr [rsp - 48] # 8-byte Reload
lea r9, [rax + 4*rbx]
# Clear the four float accumulators and set the inner counter r11d = r14d.
xorps xmm11, xmm11
mov r11d, r14d
xorps xmm12, xmm12
xorps xmm10, xmm10
xorps xmm9, xmm9
.p2align 4, 0x90
.LBB5_4: # Parent Loop BB5_2 Depth=1
# => This Inner Loop Header: Depth=2
# Manual gather (no hardware gather is used): each index vector is moved to
# general registers as two qwords, every 32-bit lane is sign-extended, and the
# selected floats are assembled lane by lane with movss + insertps.
# Indices in xmm5 -> gathered floats in xmm1.
movq rbp, xmm5
pextrq rdi, xmm5, 1
movsxd r12, ebp
sar rbp, 32
movsxd r13, edi
sar rdi, 32
movss xmm1, dword ptr [rsi + 4*r12] # xmm1 = mem[0],zero,zero,zero
insertps xmm1, dword ptr [rsi + 4*rbp], 16 # xmm1 = xmm1[0],mem[0],xmm1[2,3]
insertps xmm1, dword ptr [rsi + 4*r13], 32 # xmm1 = xmm1[0,1],mem[0],xmm1[3]
insertps xmm1, dword ptr [rsi + 4*rdi], 48 # xmm1 = xmm1[0,1,2],mem[0]
# Indices in xmm6 -> gathered floats in xmm2.
movq rdi, xmm6
pextrq rbp, xmm6, 1
movsxd rax, edi
sar rdi, 32
movsxd rdx, ebp
sar rbp, 32
movss xmm2, dword ptr [rsi + 4*rax] # xmm2 = mem[0],zero,zero,zero
insertps xmm2, dword ptr [rsi + 4*rdi], 16 # xmm2 = xmm2[0],mem[0],xmm2[2,3]
insertps xmm2, dword ptr [rsi + 4*rdx], 32 # xmm2 = xmm2[0,1],mem[0],xmm2[3]
insertps xmm2, dword ptr [rsi + 4*rbp], 48 # xmm2 = xmm2[0,1,2],mem[0]
# Indices in xmm7 -> gathered floats in xmm3.
movq rax, xmm7
pextrq rdx, xmm7, 1
movsxd rdi, eax
sar rax, 32
movsxd rbp, edx
sar rdx, 32
movss xmm3, dword ptr [rsi + 4*rdi] # xmm3 = mem[0],zero,zero,zero
insertps xmm3, dword ptr [rsi + 4*rax], 16 # xmm3 = xmm3[0],mem[0],xmm3[2,3]
insertps xmm3, dword ptr [rsi + 4*rbp], 32 # xmm3 = xmm3[0,1],mem[0],xmm3[3]
insertps xmm3, dword ptr [rsi + 4*rdx], 48 # xmm3 = xmm3[0,1,2],mem[0]
# Indices in xmm0 -> gathered floats in xmm4.
movq rax, xmm0
pextrq rdx, xmm0, 1
movsxd rdi, eax
sar rax, 32
movsxd rbp, edx
sar rdx, 32
movss xmm4, dword ptr [rsi + 4*rdi] # xmm4 = mem[0],zero,zero,zero
insertps xmm4, dword ptr [rsi + 4*rax], 16 # xmm4 = xmm4[0],mem[0],xmm4[2,3]
insertps xmm4, dword ptr [rsi + 4*rbp], 32 # xmm4 = xmm4[0,1],mem[0],xmm4[3]
insertps xmm4, dword ptr [rsi + 4*rdx], 48 # xmm4 = xmm4[0,1,2],mem[0]
# Multiply the 16 gathered floats by the 16 floats at the r9 cursor. mulps with
# a memory operand faults on addresses that are not 16-byte aligned, so this
# relies on the table being aligned (not verifiable from this excerpt).
mulps xmm1, xmmword ptr [r9 - 48]
mulps xmm2, xmmword ptr [r9 - 32]
mulps xmm3, xmmword ptr [r9 - 16]
mulps xmm4, xmmword ptr [r9]
# Accumulate the products into the four running sums.
addps xmm11, xmm1
addps xmm12, xmm2
addps xmm10, xmm3
addps xmm9, xmm4
# Step all 16 indices by xmm8 and move the multiplier cursor on by 16 floats.
paddd xmm5, xmm8
paddd xmm6, xmm8
paddd xmm7, xmm8
paddd xmm0, xmm8
add r9, 64
dec r11d
jne .LBB5_4
# BB#5: # in Loop: Header=BB5_2 Depth=1
# Advance the multiplier offset by the amount spilled at [rsp - 40].
add rbx, qword ptr [rsp - 40] # 8-byte Folded Reload
jmp .LBB5_7
.p2align 4, 0x90
.LBB5_6: # in Loop: Header=BB5_2 Depth=1
# Inner loop skipped: the sums are all zero (rbx is left unchanged on this path).
xorps xmm9, xmm9
xorps xmm10, xmm10
xorps xmm12, xmm12
xorps xmm11, xmm11
.LBB5_7: # in Loop: Header=BB5_2 Depth=1
# Float sums -> 16 bytes. roundps with imm8 = 8 rounds to nearest-even with the
# precision exception suppressed; cvtps2dq then converts to int32; packusdw and
# packuswb narrow with unsigned saturation, leaving 8 bytes in each low qword.
roundps xmm0, xmm11, 8
roundps xmm1, xmm12, 8
cvtps2dq xmm0, xmm0
cvtps2dq xmm1, xmm1
packusdw xmm0, xmm1
packuswb xmm0, xmm0
roundps xmm1, xmm10, 8
roundps xmm2, xmm9, 8
cvtps2dq xmm1, xmm1
cvtps2dq xmm2, xmm2
packusdw xmm1, xmm2
packuswb xmm1, xmm1
# Join the two 8-byte halves and store the 16 result bytes.
punpcklqdq xmm0, xmm1 # xmm0 = xmm0[0],xmm1[0]
movdqu xmmword ptr [r10 + r15], xmm0
# Reset rbx to 0 once it equals the limit spilled at [rsp - 64]. The zero is
# produced with mov rather than xor so the flags from cmp survive until cmove.
cmp rbx, qword ptr [rsp - 64] # 8-byte Folded Reload
mov ebp, 0
cmove rbx, rbp
# Next group of 16 outputs; loop while r15 is (signed) below the bound at [rsp - 24].
add r15, 16
cmp r15, qword ptr [rsp - 24] # 8-byte Folded Reload
jl .LBB5_2
"C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Tools\MSVC\14.11.25503\bin\HostX86\x64\CL.exe" /c /IC:\Users\yoffy\src\libiqo\include /IC:\Users\yoffy\src\libiqo\src\..\include /Zi /nologo /W3 /WX- /Ox /Ob2 /D WIN32 /D _WINDOWS /D NDEBUG /D "CMAKE_INTDIR=\"Release\"" /D _MBCS /Gm- /EHsc /MT /GS- /fp:fast /Zc:wchar_t /Zc:forScope /Zc:inline /GR /Gd /TP /FAu C:\Users\yoffy\src\libiqo\src\IQOAreaResizerImpl_SSE4_1.cpp
; Outer loop of the MSVC-generated kernel (MASM syntax, ';' comments).
; Each pass produces 16 output bytes: it loads 16 32-bit indices from around
; rbp, runs an inner gather / multiply / accumulate loop r8d times, then
; rounds, converts and saturates the 16 float sums to bytes and stores them at [r13].
; Register roles as used in this excerpt:
;   rbp        cursor into the index table, advanced by 64 bytes per pass
;   r12        base of the float table that the indices select from
;   r14        float offset into the multiplier table addressed through rsi
;              (rsi is reloaded from the coefs$1$ slot after the inner loop)
;   r8d        inner-loop trip count (reloaded from numCoefsX$1$)
;   r13        output pointer outside the inner loop, multiplier cursor inside it
;   xmm5       initial accumulator value; set outside this excerpt
;              (presumably all zeros -- confirm)
;   xmm6-xmm9  the four float accumulators
$LL4@resizeXmai:
; Seed the four accumulators and load the 16 starting indices (xmm11..xmm14).
movaps xmm6, xmm5
movaps xmm7, xmm5
movaps xmm8, xmm5
movaps xmm9, xmm5
movdqu xmm11, XMMWORD PTR [rbp-32]
movdqu xmm12, XMMWORD PTR [rbp-16]
movdqu xmm13, XMMWORD PTR [rbp]
movdqu xmm14, XMMWORD PTR [rbp+16]
; A trip count <= 0 skips the inner loop; the accumulators keep xmm5's value.
test r8d, r8d
jle $LN6@resizeXmai
; xmm10 = four dwords of 1, the per-iteration index increment.
movdqa xmm10, XMMWORD PTR __xmm@00000001000000010000000100000001
; r13 = rsi + 4*(r14 + 8): multiplier cursor biased by 32 bytes, so the inner
; loop reads the 64 bytes at [r13 - 32 .. r13 + 31].
lea r13, QWORD PTR [r14+8]
mov eax, r8d
lea r13, QWORD PTR [rsi+r13*4]
; r14 += 16 * r8d: the multiplier offset is advanced up front by the 16 floats
; per iteration that the inner loop is about to consume.
shl rax, 4
add r14, rax
; r15 = inner counter (the 32-bit mov zero-extends into the full register).
mov r15d, r8d
npad 1
$LL7@resizeXmai:
; Manual gather: each index vector is moved to general registers as two qwords.
; For every qword the low dword is sign-extended directly and the high dword is
; obtained with shr 32 + movsxd; the selected floats are then assembled with
; movss + insertps. The four gathers are interleaved with one another and with
; the index increments. r8 and rsi serve as scratch here and are reloaded from
; their stack slots after the loop.
pextrq rdx, xmm11, 1
movq r8, xmm11
movq r10, xmm12
pextrq r9, xmm12, 1
pextrq r11, xmm13, 1
movq rbx, xmm13
mov rax, rdx
shr rax, 32 ; 00000020H
movq rsi, xmm14
movsxd rcx, eax
movdqa xmm1, xmm10
movsxd rax, edx
; xmm12 += 1 per lane; its lanes were already copied to r9/r10 above.
paddd xmm1, xmm12
pextrq rdi, xmm14, 1
movdqa xmm12, xmm1
; Gather 1 (indices from xmm11): lanes 3 and 2 come from rdx, lanes 0 and 1 from r8.
movss xmm4, DWORD PTR [r12+rcx*4]
movss xmm0, DWORD PTR [r12+rax*4]
mov rax, r8
shr rax, 32 ; 00000020H
movsxd rcx, eax
movsxd rax, r8d
movss xmm2, DWORD PTR [r12+rax*4]
mov rax, r9
insertps xmm2, DWORD PTR [r12+rcx*4], 16
insertps xmm2, xmm0, 32 ; 00000020H
insertps xmm2, xmm4, 48 ; 00000030H; Line 375
; Multiply by the first four multipliers and accumulate into xmm6. mulps with a
; memory operand faults on addresses that are not 16-byte aligned, so this
; relies on the table being aligned (not verifiable from this excerpt).
mulps xmm2, XMMWORD PTR [r13-32]
shr rax, 32 ; 00000020H
movsxd rcx, eax
movsxd rax, r9d
addps xmm6, xmm2
; Gather 2 (indices from xmm12, held in r9/r10) -> accumulate into xmm7.
movss xmm4, DWORD PTR [r12+rcx*4]
movss xmm0, DWORD PTR [r12+rax*4]
mov rax, r10
shr rax, 32 ; 00000020H
movsxd rcx, eax
movsxd rax, r10d
movss xmm2, DWORD PTR [r12+rax*4]
mov rax, r11
insertps xmm2, DWORD PTR [r12+rcx*4], 16
insertps xmm2, xmm0, 32 ; 00000020H
insertps xmm2, xmm4, 48 ; 00000030H
mulps xmm2, XMMWORD PTR [r13-16]
shr rax, 32 ; 00000020H
movsxd rcx, eax
movsxd rax, r11d
addps xmm7, xmm2
; Gather 3 (indices from xmm13, held in r11/rbx) -> accumulate into xmm8.
movss xmm4, DWORD PTR [r12+rcx*4]
movss xmm0, DWORD PTR [r12+rax*4]
mov rax, rbx
shr rax, 32 ; 00000020H
movsxd rcx, eax
movsxd rax, ebx
movss xmm2, DWORD PTR [r12+rax*4]
mov rax, rdi
insertps xmm2, DWORD PTR [r12+rcx*4], 16
insertps xmm2, xmm0, 32 ; 00000020H
insertps xmm2, xmm4, 48 ; 00000030H
mulps xmm2, XMMWORD PTR [r13]
shr rax, 32 ; 00000020H
movsxd rcx, eax
movsxd rax, edi
addps xmm8, xmm2
; Gather 4 (indices from xmm14, held in rdi/rsi) -> accumulate into xmm9.
movss xmm4, DWORD PTR [r12+rcx*4]
movss xmm0, DWORD PTR [r12+rax*4]
mov rax, rsi
shr rax, 32 ; 00000020H
movsxd rcx, eax
movsxd rax, esi
movss xmm2, DWORD PTR [r12+rax*4]
insertps xmm2, DWORD PTR [r12+rcx*4], 16
insertps xmm2, xmm0, 32 ; 00000020H
; xmm11 += 1 per lane (xmm0 is free again after the insertps above).
movdqa xmm0, xmm10
paddd xmm0, xmm11
movdqa xmm11, xmm0
movdqa xmm0, xmm10
insertps xmm2, xmm4, 48 ; 00000030H
mulps xmm2, XMMWORD PTR [r13+16]
; xmm13 += 1 per lane.
paddd xmm0, xmm13
movdqa xmm13, xmm0
addps xmm9, xmm2
movdqa xmm1, xmm10
; Move the multiplier cursor on by 16 floats.
add r13, 64 ; 00000040H
; xmm14 += 1 per lane.
paddd xmm1, xmm14
movdqa xmm14, xmm1
sub r15, 1
jne $LL7@resizeXmai
; Restore the registers the inner loop used as scratch: r13 becomes the output
; pointer again, rsi and r8d come back from their named slots, and rcx receives
; the value that r14 is compared against below.
mov r13, QWORD PTR tv3180[rsp]
mov rsi, QWORD PTR coefs$1$[rsp]
mov r8d, DWORD PTR numCoefsX$1$[rsp]
mov rcx, QWORD PTR tv3026[rsp]
$LN6@resizeXmai:
; rax = 0 is the wrap value chosen by the cmovne below; rbp moves on to the
; next 16 indices.
xor eax, eax
add rbp, 64 ; 00000040H
; Float sums -> 16 bytes. roundps with imm8 = 8 rounds to nearest-even with the
; precision exception suppressed; cvtps2dq then converts to int32; packusdw and
; packuswb narrow with unsigned saturation. xmm4 ends up with the bytes from
; xmm6/xmm7 and xmm3 with the bytes from xmm8/xmm9.
roundps xmm0, xmm7, 8
cvtps2dq xmm2, xmm0
roundps xmm1, xmm6, 8
cvtps2dq xmm4, xmm1
roundps xmm0, xmm9, 8
packusdw xmm4, xmm2
roundps xmm1, xmm8, 8
cvtps2dq xmm2, xmm0
cvtps2dq xmm3, xmm1
packuswb xmm4, xmm4
packusdw xmm3, xmm2
packuswb xmm3, xmm3
; Join the two 8-byte halves, store the 16 result bytes and advance the output pointer.
punpcklqdq xmm4, xmm3
movdqu XMMWORD PTR [r13], xmm4
add r13, 16
; Wrap the multiplier offset: r14 becomes 0 when it equals rcx, otherwise it is
; kept. The mov instructions in between do not modify flags, so cmovne still
; sees the result of this cmp.
cmp r14, rcx
mov QWORD PTR tv3180[rsp], r13
cmovne rax, r14
; Count down the outer-loop counter kept in the tv3017 stack slot; jne tests the
; flags of this sub because the following mov leaves them untouched.
sub QWORD PTR tv3017[rsp], 1
mov r14, rax
jne $LL4@resizeXmai
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment