Skip to content

Instantly share code, notes, and snippets.

@yoffy
Last active October 6, 2017 00:29
Show Gist options
  • Select an option

  • Save yoffy/72e2f6fedca8de5651a5be48ed723393 to your computer and use it in GitHub Desktop.

Select an option

Save yoffy/72e2f6fedca8de5651a5be48ed723393 to your computer and use it in GitHub Desktop.
# clang++-4.0 -I/home/yoffy/src/libiqo/include -I/home/yoffy/src/libiqo/src/../include -Wall -Wextra -Wconversion -Wno-sign-conversion -std=c++98 -fno-exceptions -Wall -Wextra -Wconversion -Wno-sign-conversion -Ofast -march=core2 -msse4.1 -mtune=westmere -S -mllvm --x86-asm-syntax=intel /home/yoffy/src/libiqo/src/IQOLanczosResizerImpl_SSE4_1.cpp
.LBB7_11: # =>This Loop Header: Depth=1
# Child Loop BB7_12 Depth 2
#
# Outer loop (clang, Intel syntax). Each pass builds two 4-float sums in the
# inner loop, rounds/converts them, and stores 8 saturated bytes.
#
# Register roles visible in this excerpt:
#   r15  element index: scales by 4 into the 32-bit index table and by 1 into
#        the byte output; advances by 8 per pass
#   r12  index (4-byte elements) into the multiplier table read by mulps;
#        advanced by the value at [rsp - 24] and wrapped to 0 (see below)
#   rsi  base of the float array that is gathered from
#   rbx  low 32 bits = inner-loop trip count; parked in r11 because the inner
#        loop uses rbx as scratch
# Spill slots live at negative offsets from rsp (presumably the System V red
# zone -- the function prologue is outside this excerpt, confirm there).
# NOTE(review): the source file name suggests these are Lanczos coefficients
# and source-pixel indices; confirm against the C++ source.
#
# Load 8 consecutive 32-bit base indices for this pass into xmm1 (0..3) and
# xmm2 (4..7).
mov rax, qword ptr [rsp - 16] # 8-byte Reload
movdqu xmm1, xmmword ptr [rax + 4*r15]
movdqu xmm2, xmmword ptr [rax + 4*r15 + 16]
# rdi = multiplier pointer for this pass. The inner loop reads [rdi - 16] and
# [rdi], so the reloaded base is evidently biased by 16 bytes.
mov rax, qword ptr [rsp - 8] # 8-byte Reload
lea rdi, [rax + 4*r12]
# xmm0 / xmm3 = running sums for lanes 0..3 / 4..7.
xorps xmm0, xmm0
# ebp = scalar offset added to every base index; starts at the spilled value
# and increases by one per inner iteration.
mov ebp, dword ptr [rsp - 36] # 4-byte Reload
# Save rbx (restored before the back-edge) and copy the trip count to r13d.
mov r11, rbx
mov r13d, ebx
xorps xmm3, xmm3
.p2align 4, 0x90
.LBB7_12: # Parent Loop BB7_11 Depth=1
# => This Inner Loop Header: Depth=2
# Broadcast the current offset to all four lanes and add it to both base
# index vectors: xmm5 = indices 0..3, xmm4 = indices 4..7.
movd xmm4, ebp
pshufd xmm4, xmm4, 0 # xmm4 = xmm4[0,0,0,0]
movdqa xmm5, xmm4
paddd xmm5, xmm1
paddd xmm4, xmm2
# Manual gather (no hardware gather in SSE4.1): move the two 64-bit halves to
# GPRs, then split each into two sign-extended 32-bit indices
# (movsxd = even lane, sar 32 = odd lane).
movq r8, xmm5
pextrq r9, xmm5, 1
movsxd r10, r8d
sar r8, 32
movsxd r14, r9d
sar r9, 32
# xmm5 = { rsi[i0], rsi[i1], rsi[i2], rsi[i3] }
movss xmm5, dword ptr [rsi + 4*r10] # xmm5 = mem[0],zero,zero,zero
insertps xmm5, dword ptr [rsi + 4*r8], 16 # xmm5 = xmm5[0],mem[0],xmm5[2,3]
insertps xmm5, dword ptr [rsi + 4*r14], 32 # xmm5 = xmm5[0,1],mem[0],xmm5[3]
insertps xmm5, dword ptr [rsi + 4*r9], 48 # xmm5 = xmm5[0,1,2],mem[0]
# Same gather for indices 4..7; this is where rbx gets clobbered.
movq rbx, xmm4
pextrq rcx, xmm4, 1
movsxd rdx, ebx
sar rbx, 32
movsxd rax, ecx
sar rcx, 32
movss xmm4, dword ptr [rsi + 4*rdx] # xmm4 = mem[0],zero,zero,zero
insertps xmm4, dword ptr [rsi + 4*rbx], 16 # xmm4 = xmm4[0],mem[0],xmm4[2,3]
insertps xmm4, dword ptr [rsi + 4*rax], 32 # xmm4 = xmm4[0,1],mem[0],xmm4[3]
insertps xmm4, dword ptr [rsi + 4*rcx], 48 # xmm4 = xmm4[0,1,2],mem[0]
# Multiply by 8 floats from the multiplier table and accumulate. Legacy-SSE
# mulps with a memory operand requires that address to be 16-byte aligned.
mulps xmm5, xmmword ptr [rdi - 16]
mulps xmm4, xmmword ptr [rdi]
addps xmm0, xmm5
addps xmm3, xmm4
# Next 8 multipliers, next offset, one fewer iteration to go.
add rdi, 32
inc ebp
dec r13d
jne .LBB7_12
# BB#13: # in Loop: Header=BB7_11 Depth=1
# Advance the multiplier index by the spilled stride.
add r12, qword ptr [rsp - 24] # 8-byte Folded Reload
# roundps imm8 = 8: round to nearest (even) with the precision exception
# suppressed. The second result goes to xmm1, which is free here because the
# index vector is reloaded at the top of the next pass.
roundps xmm0, xmm0, 8
roundps xmm1, xmm3, 8
# float -> int32, then narrow with unsigned saturation:
# int32 -> u16 (packusdw), u16 -> u8 (packuswb); the low 8 bytes of xmm0 hold
# the 8 results in lane order.
cvtps2dq xmm0, xmm0
cvtps2dq xmm1, xmm1
packusdw xmm0, xmm1
packuswb xmm0, xmm0
# Store the 8 result bytes at output base + r15.
mov rax, qword ptr [rsp - 48] # 8-byte Reload
movq qword ptr [rax + r15], xmm0
# Wrap the multiplier index: r12 = (r12 == limit) ? 0 : r12.
# "mov eax, 0" is used instead of xor because mov leaves the flags from the
# cmp intact for cmove.
cmp r12, qword ptr [rsp - 56] # 8-byte Folded Reload
mov eax, 0
cmove r12, rax
# Next group of 8 outputs; signed compare against the spilled end index.
# rbx is restored between cmp and jl (mov does not modify flags).
add r15, 8
cmp r15, qword ptr [rsp - 32] # 8-byte Folded Reload
mov rbx, r11
jl .LBB7_11
; "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Tools\MSVC\14.11.25503\bin\HostX86\x64\CL.exe" /c /IC:\Users\yoffy\src\libiqo\include /IC:\Users\yoffy\src\libiqo\src\..\include /Zi /nologo /W3 /WX- /Ox /Ob2 /D WIN32 /D _WINDOWS /D NDEBUG /D "CMAKE_INTDIR=\"Release\"" /D _MBCS /Gm- /EHsc /MT /GS- /fp:fast /Zc:wchar_t /Zc:forScope /Zc:inline /GR /Gd /TP /FAsu C:\Users\yoffy\src\libiqo\src\IQOLanczosResizerImpl_SSE4_1.cpp
$LL4@resizeXmai:
; Outer loop (MSVC /FAsu listing). Each pass builds two 4-float sums in the
; inner loop, rounds/converts them, and stores 8 saturated bytes at [rdi].
;
; Register roles visible in this excerpt:
;   r11  pointer into the 32-bit index table, +32 bytes per pass
;   rax  on entry: byte offset from r11 to the second index vector, reloaded
;        from tv965[rsp] at the bottom of the loop (presumably 16 -- confirm)
;   r14  base of the float array that is gathered from
;   r12  base of the multiplier table (reloaded from the slot the listing
;        names coefs$1$ after the inner loop overwrites it)
;   rcx  index (4-byte elements) into that table; wrapped to 0 at r13
;   ebx  inner-loop trip count
;   rdi  output pointer, +8 bytes per pass
;   r15  outer passes remaining
;   xmm7 initial accumulator value, set outside this excerpt
;        (presumably all zeros -- confirm)
; NOTE(review): the source file name suggests these are Lanczos coefficients
; and source-pixel indices; confirm against the C++ source.
;
; xmm8 = base indices for lanes 0..3, xmm9 = second index vector;
; xmm5 / xmm6 = running sums for the xmm8 / xmm9 lanes.
movdqu xmm8, XMMWORD PTR [r11]
xor ebp, ebp
movaps xmm5, xmm7
movaps xmm6, xmm7
movdqu xmm9, XMMWORD PTR [r11+rax]
; Skip the inner loop entirely when the trip count is <= 0 (signed).
test ebx, ebx
jle $LN6@resizeXmai
; rsi = multiplier pointer for this pass.
lea rsi, QWORD PTR [r12+rcx*4]
; r12d = -(ebx / 2) with C-style truncation toward zero:
; cdq yields edx = 0 or -1, so "sub eax, edx" adds 1 to negative values before
; the arithmetic shift. Writing r12d destroys the table base held in r12.
mov eax, ebx
cdq
; Interleaved: advance the multiplier index by 8 * rbx for the next pass
; (lea touches neither edx nor the flags).
lea rcx, QWORD PTR [rcx+rbx*8]
sub eax, edx
sar eax, 1
neg eax
mov r12d, eax
npad 1
$LL7@resizeXmai:
; edx = start offset + iteration counter; ebp counts iterations 0..ebx-1.
lea edx, DWORD PTR [r12+rbp]
inc ebp
; Broadcast the offset and add it to both index vectors:
; xmm0 = indices from xmm8, xmm1 = indices from xmm9.
movd xmm1, edx
pshufd xmm1, xmm1, 0
movdqa xmm0, xmm1
paddd xmm1, xmm9
paddd xmm0, xmm8
; Manual gather (no hardware gather in SSE4.1). 64-bit halves go to GPRs:
; r8 = xmm0 lanes 0-1, rdx = xmm0 lanes 2-3, r10 = xmm1 lanes 0-1,
; r9 = xmm1 lanes 2-3. Odd lanes use shr 32 + cdqe, even lanes use movsxd;
; both produce a sign-extended 64-bit index.
movq r10, xmm1
pextrq rdx, xmm0, 1
movq r8, xmm0
; First vector, lane 3 -> xmm4
mov rax, rdx
shr rax, 32 ; 00000020H
cdqe
pextrq r9, xmm1, 1
movss xmm4, DWORD PTR [r14+rax*4]
; lane 2 -> xmm3
movsxd rax, edx
movss xmm3, DWORD PTR [r14+rax*4]
; lane 1 -> xmm0
mov rax, r8
shr rax, 32 ; 00000020H
cdqe
movss xmm0, DWORD PTR [r14+rax*4]
; lane 0 -> xmm2
movsxd rax, r8d
movss xmm2, DWORD PTR [r14+rax*4]
; Assemble xmm2 = { lane0, lane1, lane2, lane3 }; insertps imm8 bits 5:4 pick
; the destination lane. The scheduler interleaves the start of the second
; gather (rax = r9 >> 32) with these inserts.
mov rax, r9
insertps xmm2, xmm0, 16
insertps xmm2, xmm3, 32 ; 00000020H
shr rax, 32 ; 00000020H
insertps xmm2, xmm4, 48 ; 00000030H
; Multiply by the first 4 multipliers and accumulate into xmm5. Legacy-SSE
; mulps with a memory operand requires that address to be 16-byte aligned.
mulps xmm2, XMMWORD PTR [rsi]
; Second vector, lane 3 -> xmm4
cdqe
movss xmm4, DWORD PTR [r14+rax*4]
addps xmm5, xmm2
; lane 2 -> xmm3
movsxd rax, r9d
movss xmm3, DWORD PTR [r14+rax*4]
; lane 1 -> xmm0
mov rax, r10
shr rax, 32 ; 00000020H
cdqe
movss xmm0, DWORD PTR [r14+rax*4]
; lane 0 -> xmm2
movsxd rax, r10d
movss xmm2, DWORD PTR [r14+rax*4]
insertps xmm2, xmm0, 16
insertps xmm2, xmm3, 32 ; 00000020H
insertps xmm2, xmm4, 48 ; 00000030H
; Multiply by the next 4 multipliers, step to the following group of 8, and
; accumulate into xmm6.
mulps xmm2, XMMWORD PTR [rsi+16]
add rsi, 32 ; 00000020H
addps xmm6, xmm2
; Signed compare: loop while iteration counter < trip count.
cmp ebp, ebx
jl $LL7@resizeXmai
; Restore the multiplier-table base that "mov r12d, eax" overwrote.
mov r12, QWORD PTR coefs$1$[rsp]
$LN6@resizeXmai:
; rax = 0, used below as the wrap value for rcx.
xor eax, eax
; Next 8 indices.
add r11, 32 ; 00000020H
; roundps imm8 = 8: round to nearest (even) with the precision exception
; suppressed.
roundps xmm0, xmm6, 8
roundps xmm1, xmm5, 8
; float -> int32: xmm3 = first-vector sums, xmm2 = second-vector sums.
cvtps2dq xmm2, xmm0
cvtps2dq xmm3, xmm1
; Narrow with unsigned saturation: int32 -> u16 (first-vector lanes in the low
; half), then u16 -> u8; the low 8 bytes of xmm3 hold the 8 results.
packusdw xmm3, xmm2
packuswb xmm3, xmm3
; Store 8 result bytes and advance the output pointer.
movq QWORD PTR [rdi], xmm3
add rdi, 8
; Wrap the multiplier index: rcx = (rcx == r13) ? 0 : rcx.
cmp rcx, r13
cmovne rax, rcx
mov rcx, rax
; Reload the offset used for the second index load at the top of the loop
; (rax was used as scratch throughout).
mov rax, QWORD PTR tv965[rsp]
; One fewer outer pass remaining.
sub r15, 1
jne $LL4@resizeXmai
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment