yoffy · October 6, 2017 00:27
diff --git a/IQOAreaResizerImpl_SSE4_1.clang.s b/IQOAreaResizerImpl_SSE4_1.clang.s
 ; clang++-4.0 -I/home/yoffy/src/libiqo/include -I/home/yoffy/src/libiqo/src/../include -Wall -Wextra -Wconversion -Wno-sign-conversion -std=c++98 -fno-exceptions  -Wall -Wextra -Wconversion -Wno-sign-conversion  -Ofast   -march=core2 -msse4.1 -mtune=westmere -S -mllvm --x86-asm-syntax=intel /home/yoffy/src/libiqo/src/IQOAreaResizerImpl_SSE4_1.cpp

 # Loop nest excerpt from clang's listing (Intel syntax, '#' comments).
 # Each outer pass accumulates 16 single-precision sums and stores them as
 # 16 saturated unsigned bytes at [r10 + r15].
 #
 # Register roles visible in this excerpt:
 #   r15   output byte offset and index-table element offset, +16 per pass
 #   r8    base of a table of 32-bit indices (16 consecutive entries per pass)
 #   rsi   base of the float array gathered through those indices
 #   r14d  inner-loop trip count (inner loop skipped when <= 0)
 #   rbx   element offset into the multiplier array whose base is reloaded
 #         from [rsp - 48]; wrapped to 0 when it equals [rsp - 64]
 #   xmm8  per-lane index step added after every inner pass (set outside
 #         this excerpt)
 #   xmm11/xmm12/xmm10/xmm9  accumulators for output bytes 0-3/4-7/8-11/12-15
 .LBB5_2:                                # =>This Loop Header: Depth=1
                                         #     Child Loop BB5_4 Depth 2
 	test	r14d, r14d
 	jle	.LBB5_6
 # BB#3:                                 #   in Loop: Header=BB5_2 Depth=1
 # Load the 16 starting indices for this pass.  The `or` with 4/8/12 acts as
 # an add only while the low four bits of r15 are clear -- presumably r15
 # starts at a multiple of 16 (its initial value is outside this excerpt).
 	movdqu	xmm5, xmmword ptr [r8 + 4*r15]
 	mov	rbp, r15
 	or	rbp, 4
 	movdqu	xmm6, xmmword ptr [r8 + 4*rbp]
 	mov	rbp, r15
 	or	rbp, 8
 	movdqu	xmm7, xmmword ptr [r8 + 4*rbp]
 	mov	rbp, r15
 	or	rbp, 12
 	movdqu	xmm0, xmmword ptr [r8 + 4*rbp]
 # r9 = multiplier pointer.  The inner loop reads [r9 - 48]..[r9 + 15], so the
 # reloaded base is presumably already biased by 48 bytes -- TODO confirm.
 	mov	rax, qword ptr [rsp - 48] # 8-byte Reload
 	lea	r9, [rax + 4*rbx]
 # Clear the four accumulators; r11d counts the inner passes down from r14d.
 	xorps	xmm11, xmm11
 	mov	r11d, r14d
 	xorps	xmm12, xmm12
 	xorps	xmm10, xmm10
 	xorps	xmm9, xmm9
 	.p2align	4, 0x90
 .LBB5_4:                                #   Parent Loop BB5_2 Depth=1
                                         # =>  This Inner Loop Header: Depth=2
 # Split xmm5 into four sign-extended 32-bit indices (lanes 0..3 ->
 # r12, rbp, r13, rdi) and gather rsi[index] into xmm1 lane by lane.
 	movq	rbp, xmm5
 	pextrq	rdi, xmm5, 1
 	movsxd	r12, ebp
 	sar	rbp, 32
 	movsxd	r13, edi
 	sar	rdi, 32
 	movss	xmm1, dword ptr [rsi + 4*r12] # xmm1 = mem[0],zero,zero,zero
 	insertps	xmm1, dword ptr [rsi + 4*rbp], 16 # xmm1 = xmm1[0],mem[0],xmm1[2,3]
 	insertps	xmm1, dword ptr [rsi + 4*r13], 32 # xmm1 = xmm1[0,1],mem[0],xmm1[3]
 	insertps	xmm1, dword ptr [rsi + 4*rdi], 48 # xmm1 = xmm1[0,1,2],mem[0]
 # Same gather for the indices in xmm6 -> xmm2.
 	movq	rdi, xmm6
 	pextrq	rbp, xmm6, 1
 	movsxd	rax, edi
 	sar	rdi, 32
 	movsxd	rdx, ebp
 	sar	rbp, 32
 	movss	xmm2, dword ptr [rsi + 4*rax] # xmm2 = mem[0],zero,zero,zero
 	insertps	xmm2, dword ptr [rsi + 4*rdi], 16 # xmm2 = xmm2[0],mem[0],xmm2[2,3]
 	insertps	xmm2, dword ptr [rsi + 4*rdx], 32 # xmm2 = xmm2[0,1],mem[0],xmm2[3]
 	insertps	xmm2, dword ptr [rsi + 4*rbp], 48 # xmm2 = xmm2[0,1,2],mem[0]
 # Same gather for the indices in xmm7 -> xmm3.
 	movq	rax, xmm7
 	pextrq	rdx, xmm7, 1
 	movsxd	rdi, eax
 	sar	rax, 32
 	movsxd	rbp, edx
 	sar	rdx, 32
 	movss	xmm3, dword ptr [rsi + 4*rdi] # xmm3 = mem[0],zero,zero,zero
 	insertps	xmm3, dword ptr [rsi + 4*rax], 16 # xmm3 = xmm3[0],mem[0],xmm3[2,3]
 	insertps	xmm3, dword ptr [rsi + 4*rbp], 32 # xmm3 = xmm3[0,1],mem[0],xmm3[3]
 	insertps	xmm3, dword ptr [rsi + 4*rdx], 48 # xmm3 = xmm3[0,1,2],mem[0]
 # Same gather for the indices in xmm0 -> xmm4.
 	movq	rax, xmm0
 	pextrq	rdx, xmm0, 1
 	movsxd	rdi, eax
 	sar	rax, 32
 	movsxd	rbp, edx
 	sar	rdx, 32
 	movss	xmm4, dword ptr [rsi + 4*rdi] # xmm4 = mem[0],zero,zero,zero
 	insertps	xmm4, dword ptr [rsi + 4*rax], 16 # xmm4 = xmm4[0],mem[0],xmm4[2,3]
 	insertps	xmm4, dword ptr [rsi + 4*rbp], 32 # xmm4 = xmm4[0,1],mem[0],xmm4[3]
 	insertps	xmm4, dword ptr [rsi + 4*rdx], 48 # xmm4 = xmm4[0,1,2],mem[0]
 # Multiply each gathered vector by 16 bytes of multipliers and accumulate.
 	mulps	xmm1, xmmword ptr [r9 - 48]
 	mulps	xmm2, xmmword ptr [r9 - 32]
 	mulps	xmm3, xmmword ptr [r9 - 16]
 	mulps	xmm4, xmmword ptr [r9]
 	addps	xmm11, xmm1
 	addps	xmm12, xmm2
 	addps	xmm10, xmm3
 	addps	xmm9, xmm4
 # Step all 16 indices by xmm8 and move on to the next 64 bytes of multipliers.
 	paddd	xmm5, xmm8
 	paddd	xmm6, xmm8
 	paddd	xmm7, xmm8
 	paddd	xmm0, xmm8
 	add	r9, 64
 	dec	r11d
 	jne	.LBB5_4
 # BB#5:                                 #   in Loop: Header=BB5_2 Depth=1
 # Inner loop done: advance rbx by the amount spilled at [rsp - 40].
 	add	rbx, qword ptr [rsp - 40] # 8-byte Folded Reload
 	jmp	.LBB5_7
 	.p2align	4, 0x90
 .LBB5_6:                                #   in Loop: Header=BB5_2 Depth=1
 # Inner loop skipped (r14d <= 0): the sums are all zero.
 	xorps	xmm9, xmm9
 	xorps	xmm10, xmm10
 	xorps	xmm12, xmm12
 	xorps	xmm11, xmm11
 .LBB5_7:                                #   in Loop: Header=BB5_2 Depth=1
 # roundps imm8 = 8 rounds to nearest with the precision exception
 # suppressed; cvtps2dq then yields int32, and packusdw/packuswb narrow with
 # unsigned saturation to 16 and then 8 bits.
 	roundps	xmm0, xmm11, 8
 	roundps	xmm1, xmm12, 8
 	cvtps2dq	xmm0, xmm0
 	cvtps2dq	xmm1, xmm1
 	packusdw	xmm0, xmm1
 	packuswb	xmm0, xmm0
 	roundps	xmm1, xmm10, 8
 	roundps	xmm2, xmm9, 8
 	cvtps2dq	xmm1, xmm1
 	cvtps2dq	xmm2, xmm2
 	packusdw	xmm1, xmm2
 	packuswb	xmm1, xmm1
 # Join the two 8-byte halves and store all 16 output bytes (unaligned store).
 	punpcklqdq	xmm0, xmm1      # xmm0 = xmm0[0],xmm1[0]
 	movdqu	xmmword ptr [r10 + r15], xmm0
 # Wrap rbx to 0 when it reaches the value at [rsp - 64].  Zero is loaded with
 # mov rather than xor because xor would overwrite the flags cmove consumes.
 	cmp	rbx, qword ptr [rsp - 64] # 8-byte Folded Reload
 	mov	ebp, 0
 	cmove	rbx, rbp
 # Next 16 outputs; continue while r15 < [rsp - 24] (signed compare).
 	add	r15, 16
 	cmp	r15, qword ptr [rsp - 24] # 8-byte Folded Reload
 	jl	.LBB5_2
diff --git a/IQOAreaResizerImpl_SSE4_1.vc.asm b/IQOAreaResizerImpl_SSE4_1.vc.asm
 "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Tools\MSVC\14.11.25503\bin\HostX86\x64\CL.exe" /c /IC:\Users\yoffy\src\libiqo\include /IC:\Users\yoffy\src\libiqo\src\..\include /Zi /nologo /W3 /WX- /Ox /Ob2 /D WIN32 /D _WINDOWS /D NDEBUG /D "CMAKE_INTDIR=\"Release\"" /D _MBCS /Gm- /EHsc /MT /GS- /fp:fast /Zc:wchar_t /Zc:forScope /Zc:inline /GR /Gd /TP /FAu C:\Users\yoffy\src\libiqo\src\IQOAreaResizerImpl_SSE4_1.cpp

 ; Loop nest excerpt from the MSVC listing.  Each outer pass accumulates 16
 ; single-precision sums and stores them as 16 saturated unsigned bytes at
 ; [r13].
 ;
 ; Register roles visible in this excerpt:
 ;   rbp    pointer into a table of 32-bit indices; 64 bytes are read around
 ;          it per pass and it advances by 64
 ;   r12    base of the float array gathered through those indices
 ;   r8d    inner-loop trip count (reloaded from numCoefsX$1$ after the loop)
 ;   rsi    base of the multiplier array (reloaded from coefs$1$)
 ;   r14    element offset into that array; reset to 0 when it equals rcx
 ;   rcx    wrap limit for r14 (reloaded from tv3026)
 ;   r13    output pointer between passes (also kept in tv3180)
 ;   xmm5   initial accumulator value, presumably all zeros -- it is set
 ;          outside this excerpt
 ;   xmm6-xmm9    accumulators for output bytes 0-3, 4-7, 8-11, 12-15
 ;   xmm11-xmm14  the 16 current indices
 ;   tv3017       stack slot counting the outer passes still to run
 $LL4@resizeXmai:
 	movaps	xmm6, xmm5
 	movaps	xmm7, xmm5
 	movaps	xmm8, xmm5
 	movaps	xmm9, xmm5
 ; Load the 16 starting indices for this pass.
 	movdqu	xmm11, XMMWORD PTR [rbp-32]
 	movdqu	xmm12, XMMWORD PTR [rbp-16]
 	movdqu	xmm13, XMMWORD PTR [rbp]
 	movdqu	xmm14, XMMWORD PTR [rbp+16]
 ; Skip the inner loop entirely when the trip count is <= 0.
 	test	r8d, r8d
 	jle	$LN6@resizeXmai
 ; xmm10 = index step; the constant's name encodes four dwords of 1.
 ; r13 = rsi + 4*r14 + 32, so [r13-32]..[r13+31] covers 16 multipliers.
 ; r14 += 16 * r8d is applied up front; r15 counts the inner passes.
 	movdqa	xmm10, XMMWORD PTR __xmm@00000001000000010000000100000001
 	lea	r13, QWORD PTR [r14+8]
 	mov	eax, r8d
 	lea	r13, QWORD PTR [rsi+r13*4]
 	shl	rax, 4
 	add	r14, rax
 	mov	r15d, r8d
 	npad	1
 $LL7@resizeXmai:
 ; Pull the index vectors into general registers, 64 bits (two indices) at a
 ; time: xmm11 -> r8 (lanes 0,1), rdx (lanes 2,3); xmm12 -> r10, r9;
 ; xmm13 -> rbx, r11; xmm14 -> rsi, rdi.  This overwrites r8 and rsi, which
 ; is why they are reloaded after the loop.
 	pextrq	rdx, xmm11, 1
 	movq	r8, xmm11
 	movq	r10, xmm12
 	pextrq	r9, xmm12, 1
 	pextrq	r11, xmm13, 1
 	movq	rbx, xmm13
 ; xmm11 lanes 3 and 2: sign-extend each index and load its float into
 ; xmm4 / xmm0.  The xmm12 index step is interleaved here; its lanes were
 ; already copied to r10/r9 above.
 	mov	rax, rdx
 	shr	rax, 32					; 00000020H
 	movq	rsi, xmm14
 	movsxd	rcx, eax
 	movdqa	xmm1, xmm10
 	movsxd	rax, edx
 	paddd	xmm1, xmm12
 	pextrq	rdi, xmm14, 1
 	movdqa	xmm12, xmm1
 	movss	xmm4, DWORD PTR [r12+rcx*4]
 	movss	xmm0, DWORD PTR [r12+rax*4]
 ; xmm11 lanes 1 and 0, then assemble xmm2 = gathered lanes 0..3, multiply by
 ; the first four multipliers and add into xmm6.
 	mov	rax, r8
 	shr	rax, 32					; 00000020H
 	movsxd	rcx, eax
 	movsxd	rax, r8d
 	movss	xmm2, DWORD PTR [r12+rax*4]
 	mov	rax, r9
 	insertps xmm2, DWORD PTR [r12+rcx*4], 16
 	insertps xmm2, xmm0, 32				; 00000020H
 	insertps xmm2, xmm4, 48				; 00000030H; Line 375
 	mulps	xmm2, XMMWORD PTR [r13-32]
 ; Same sequence for the original xmm12 indices (r9 = lanes 2,3; r10 =
 ; lanes 0,1) -> accumulate into xmm7.
 	shr	rax, 32					; 00000020H
 	movsxd	rcx, eax
 	movsxd	rax, r9d
 	addps	xmm6, xmm2
 	movss	xmm4, DWORD PTR [r12+rcx*4]
 	movss	xmm0, DWORD PTR [r12+rax*4]
 	mov	rax, r10
 	shr	rax, 32					; 00000020H
 	movsxd	rcx, eax
 	movsxd	rax, r10d
 	movss	xmm2, DWORD PTR [r12+rax*4]
 	mov	rax, r11
 	insertps xmm2, DWORD PTR [r12+rcx*4], 16
 	insertps xmm2, xmm0, 32				; 00000020H
 	insertps xmm2, xmm4, 48				; 00000030H
 	mulps	xmm2, XMMWORD PTR [r13-16]
 ; Same sequence for the xmm13 indices (r11 = lanes 2,3; rbx = lanes 0,1)
 ; -> accumulate into xmm8.
 	shr	rax, 32					; 00000020H
 	movsxd	rcx, eax
 	movsxd	rax, r11d
 	addps	xmm7, xmm2
 	movss	xmm4, DWORD PTR [r12+rcx*4]
 	movss	xmm0, DWORD PTR [r12+rax*4]
 	mov	rax, rbx
 	shr	rax, 32					; 00000020H
 	movsxd	rcx, eax
 	movsxd	rax, ebx
 	movss	xmm2, DWORD PTR [r12+rax*4]
 	mov	rax, rdi
 	insertps xmm2, DWORD PTR [r12+rcx*4], 16
 	insertps xmm2, xmm0, 32				; 00000020H
 	insertps xmm2, xmm4, 48				; 00000030H
 	mulps	xmm2, XMMWORD PTR [r13]
 ; Same sequence for the xmm14 indices (rdi = lanes 2,3; rsi = lanes 0,1)
 ; -> accumulate into xmm9.  The index steps for xmm11, xmm13 and xmm14 are
 ; interleaved with the tail of this gather.
 	shr	rax, 32					; 00000020H
 	movsxd	rcx, eax
 	movsxd	rax, edi
 	addps	xmm8, xmm2
 	movss	xmm4, DWORD PTR [r12+rcx*4]
 	movss	xmm0, DWORD PTR [r12+rax*4]
 	mov	rax, rsi
 	shr	rax, 32					; 00000020H
 	movsxd	rcx, eax
 	movsxd	rax, esi
 	movss	xmm2, DWORD PTR [r12+rax*4]
 	insertps xmm2, DWORD PTR [r12+rcx*4], 16
 	insertps xmm2, xmm0, 32				; 00000020H
 	movdqa	xmm0, xmm10
 	paddd	xmm0, xmm11
 	movdqa	xmm11, xmm0
 	movdqa	xmm0, xmm10
 	insertps xmm2, xmm4, 48				; 00000030H
 	mulps	xmm2, XMMWORD PTR [r13+16]
 	paddd	xmm0, xmm13
 	movdqa	xmm13, xmm0
 	addps	xmm9, xmm2
 	movdqa	xmm1, xmm10
 ; Next 64 bytes of multipliers; one fewer inner pass to go.
 	add	r13, 64					; 00000040H
 	paddd	xmm1, xmm14
 	movdqa	xmm14, xmm1
 	sub	r15, 1
 	jne	$LL7@resizeXmai
 ; Restore the registers the inner loop used as scratch.
 	mov	r13, QWORD PTR tv3180[rsp]
 	mov	rsi, QWORD PTR coefs$1$[rsp]
 	mov	r8d, DWORD PTR numCoefsX$1$[rsp]
 	mov	rcx, QWORD PTR tv3026[rsp]
 $LN6@resizeXmai:
 ; rax = 0 is the wrap value that cmovne below leaves in place when
 ; r14 == rcx.  rbp moves on to the next 16 indices.
 	xor	eax, eax
 	add	rbp, 64					; 00000040H
 ; roundps imm8 = 8 rounds to nearest with the precision exception
 ; suppressed; cvtps2dq then yields int32, and packusdw/packuswb narrow with
 ; unsigned saturation to 16 and then 8 bits.
 	roundps	xmm0, xmm7, 8
 	cvtps2dq xmm2, xmm0
 	roundps	xmm1, xmm6, 8
 	cvtps2dq xmm4, xmm1
 	roundps	xmm0, xmm9, 8
 	packusdw xmm4, xmm2
 	roundps	xmm1, xmm8, 8
 	cvtps2dq xmm2, xmm0
 	cvtps2dq xmm3, xmm1
 	packuswb xmm4, xmm4
 	packusdw xmm3, xmm2
 	packuswb xmm3, xmm3
 ; Join the two 8-byte halves and store all 16 output bytes (unaligned store).
 	punpcklqdq xmm4, xmm3
 	movdqu	XMMWORD PTR [r13], xmm4
 	add	r13, 16
 ; r14 = (r14 != rcx) ? r14 : 0
 	cmp	r14, rcx
 	mov	QWORD PTR tv3180[rsp], r13
 	cmovne	rax, r14
 ; One fewer outer pass remaining.  The jne below uses the flags from this
 ; sub; the mov in between does not modify flags.
 	sub	QWORD PTR tv3017[rsp], 1
 	mov	r14, rax
 	jne	$LL4@resizeXmai
	; clang++-4.0 -I/home/yoffy/src/libiqo/include -I/home/yoffy/src/libiqo/src/../include -Wall -Wextra -Wconversion -Wno-sign-conversion -std=c++98 -fno-exceptions -Wall -Wextra -Wconversion -Wno-sign-conversion -Ofast -march=core2 -msse4.1 -mtune=westmere -S -mllvm --x86-asm-syntax=intel /home/yoffy/src/libiqo/src/IQOAreaResizerImpl_SSE4_1.cpp

	# Loop nest excerpt from clang's listing (Intel syntax, '#' comments).
	# Each outer pass accumulates 16 single-precision sums and stores them as
	# 16 saturated unsigned bytes at [r10 + r15].
	#
	# Register roles visible in this excerpt:
	#   r15   output byte offset and index-table element offset, +16 per pass
	#   r8    base of a table of 32-bit indices (16 consecutive entries per pass)
	#   rsi   base of the float array gathered through those indices
	#   r14d  inner-loop trip count (inner loop skipped when <= 0)
	#   rbx   element offset into the multiplier array whose base is reloaded
	#         from [rsp - 48]; wrapped to 0 when it equals [rsp - 64]
	#   xmm8  per-lane index step added after every inner pass (set outside
	#         this excerpt)
	#   xmm11/xmm12/xmm10/xmm9  accumulators for output bytes 0-3/4-7/8-11/12-15
	.LBB5_2: # =>This Loop Header: Depth=1
	# Child Loop BB5_4 Depth 2
	test r14d, r14d
	jle .LBB5_6
	# BB#3: # in Loop: Header=BB5_2 Depth=1
	# Load the 16 starting indices for this pass.  The `or` with 4/8/12 acts as
	# an add only while the low four bits of r15 are clear -- presumably r15
	# starts at a multiple of 16 (its initial value is outside this excerpt).
	movdqu xmm5, xmmword ptr [r8 + 4*r15]
	mov rbp, r15
	or rbp, 4
	movdqu xmm6, xmmword ptr [r8 + 4*rbp]
	mov rbp, r15
	or rbp, 8
	movdqu xmm7, xmmword ptr [r8 + 4*rbp]
	mov rbp, r15
	or rbp, 12
	movdqu xmm0, xmmword ptr [r8 + 4*rbp]
	# r9 = multiplier pointer.  The inner loop reads [r9 - 48]..[r9 + 15], so the
	# reloaded base is presumably already biased by 48 bytes -- TODO confirm.
	mov rax, qword ptr [rsp - 48] # 8-byte Reload
	lea r9, [rax + 4*rbx]
	# Clear the four accumulators; r11d counts the inner passes down from r14d.
	xorps xmm11, xmm11
	mov r11d, r14d
	xorps xmm12, xmm12
	xorps xmm10, xmm10
	xorps xmm9, xmm9
	.p2align 4, 0x90
	.LBB5_4: # Parent Loop BB5_2 Depth=1
	# => This Inner Loop Header: Depth=2
	# Split xmm5 into four sign-extended 32-bit indices (lanes 0..3 ->
	# r12, rbp, r13, rdi) and gather rsi[index] into xmm1 lane by lane.
	movq rbp, xmm5
	pextrq rdi, xmm5, 1
	movsxd r12, ebp
	sar rbp, 32
	movsxd r13, edi
	sar rdi, 32
	movss xmm1, dword ptr [rsi + 4*r12] # xmm1 = mem[0],zero,zero,zero
	insertps xmm1, dword ptr [rsi + 4*rbp], 16 # xmm1 = xmm1[0],mem[0],xmm1[2,3]
	insertps xmm1, dword ptr [rsi + 4*r13], 32 # xmm1 = xmm1[0,1],mem[0],xmm1[3]
	insertps xmm1, dword ptr [rsi + 4*rdi], 48 # xmm1 = xmm1[0,1,2],mem[0]
	# Same gather for the indices in xmm6 -> xmm2.
	movq rdi, xmm6
	pextrq rbp, xmm6, 1
	movsxd rax, edi
	sar rdi, 32
	movsxd rdx, ebp
	sar rbp, 32
	movss xmm2, dword ptr [rsi + 4*rax] # xmm2 = mem[0],zero,zero,zero
	insertps xmm2, dword ptr [rsi + 4*rdi], 16 # xmm2 = xmm2[0],mem[0],xmm2[2,3]
	insertps xmm2, dword ptr [rsi + 4*rdx], 32 # xmm2 = xmm2[0,1],mem[0],xmm2[3]
	insertps xmm2, dword ptr [rsi + 4*rbp], 48 # xmm2 = xmm2[0,1,2],mem[0]
	# Same gather for the indices in xmm7 -> xmm3.
	movq rax, xmm7
	pextrq rdx, xmm7, 1
	movsxd rdi, eax
	sar rax, 32
	movsxd rbp, edx
	sar rdx, 32
	movss xmm3, dword ptr [rsi + 4*rdi] # xmm3 = mem[0],zero,zero,zero
	insertps xmm3, dword ptr [rsi + 4*rax], 16 # xmm3 = xmm3[0],mem[0],xmm3[2,3]
	insertps xmm3, dword ptr [rsi + 4*rbp], 32 # xmm3 = xmm3[0,1],mem[0],xmm3[3]
	insertps xmm3, dword ptr [rsi + 4*rdx], 48 # xmm3 = xmm3[0,1,2],mem[0]
	# Same gather for the indices in xmm0 -> xmm4.
	movq rax, xmm0
	pextrq rdx, xmm0, 1
	movsxd rdi, eax
	sar rax, 32
	movsxd rbp, edx
	sar rdx, 32
	movss xmm4, dword ptr [rsi + 4*rdi] # xmm4 = mem[0],zero,zero,zero
	insertps xmm4, dword ptr [rsi + 4*rax], 16 # xmm4 = xmm4[0],mem[0],xmm4[2,3]
	insertps xmm4, dword ptr [rsi + 4*rbp], 32 # xmm4 = xmm4[0,1],mem[0],xmm4[3]
	insertps xmm4, dword ptr [rsi + 4*rdx], 48 # xmm4 = xmm4[0,1,2],mem[0]
	# Multiply each gathered vector by 16 bytes of multipliers and accumulate.
	mulps xmm1, xmmword ptr [r9 - 48]
	mulps xmm2, xmmword ptr [r9 - 32]
	mulps xmm3, xmmword ptr [r9 - 16]
	mulps xmm4, xmmword ptr [r9]
	addps xmm11, xmm1
	addps xmm12, xmm2
	addps xmm10, xmm3
	addps xmm9, xmm4
	# Step all 16 indices by xmm8 and move on to the next 64 bytes of multipliers.
	paddd xmm5, xmm8
	paddd xmm6, xmm8
	paddd xmm7, xmm8
	paddd xmm0, xmm8
	add r9, 64
	dec r11d
	jne .LBB5_4
	# BB#5: # in Loop: Header=BB5_2 Depth=1
	# Inner loop done: advance rbx by the amount spilled at [rsp - 40].
	add rbx, qword ptr [rsp - 40] # 8-byte Folded Reload
	jmp .LBB5_7
	.p2align 4, 0x90
	.LBB5_6: # in Loop: Header=BB5_2 Depth=1
	# Inner loop skipped (r14d <= 0): the sums are all zero.
	xorps xmm9, xmm9
	xorps xmm10, xmm10
	xorps xmm12, xmm12
	xorps xmm11, xmm11
	.LBB5_7: # in Loop: Header=BB5_2 Depth=1
	# roundps imm8 = 8 rounds to nearest with the precision exception
	# suppressed; cvtps2dq then yields int32, and packusdw/packuswb narrow with
	# unsigned saturation to 16 and then 8 bits.
	roundps xmm0, xmm11, 8
	roundps xmm1, xmm12, 8
	cvtps2dq xmm0, xmm0
	cvtps2dq xmm1, xmm1
	packusdw xmm0, xmm1
	packuswb xmm0, xmm0
	roundps xmm1, xmm10, 8
	roundps xmm2, xmm9, 8
	cvtps2dq xmm1, xmm1
	cvtps2dq xmm2, xmm2
	packusdw xmm1, xmm2
	packuswb xmm1, xmm1
	# Join the two 8-byte halves and store all 16 output bytes (unaligned store).
	punpcklqdq xmm0, xmm1 # xmm0 = xmm0[0],xmm1[0]
	movdqu xmmword ptr [r10 + r15], xmm0
	# Wrap rbx to 0 when it reaches the value at [rsp - 64].  Zero is loaded with
	# mov rather than xor because xor would overwrite the flags cmove consumes.
	cmp rbx, qword ptr [rsp - 64] # 8-byte Folded Reload
	mov ebp, 0
	cmove rbx, rbp
	# Next 16 outputs; continue while r15 < [rsp - 24] (signed compare).
	add r15, 16
	cmp r15, qword ptr [rsp - 24] # 8-byte Folded Reload
	jl .LBB5_2
	"C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Tools\MSVC\14.11.25503\bin\HostX86\x64\CL.exe" /c /IC:\Users\yoffy\src\libiqo\include /IC:\Users\yoffy\src\libiqo\src\..\include /Zi /nologo /W3 /WX- /Ox /Ob2 /D WIN32 /D _WINDOWS /D NDEBUG /D "CMAKE_INTDIR=\"Release\"" /D _MBCS /Gm- /EHsc /MT /GS- /fp:fast /Zc:wchar_t /Zc:forScope /Zc:inline /GR /Gd /TP /FAu C:\Users\yoffy\src\libiqo\src\IQOAreaResizerImpl_SSE4_1.cpp

	; Loop nest excerpt from the MSVC listing.  Each outer pass accumulates 16
	; single-precision sums and stores them as 16 saturated unsigned bytes at
	; [r13].
	;
	; Register roles visible in this excerpt:
	;   rbp    pointer into a table of 32-bit indices; 64 bytes are read around
	;          it per pass and it advances by 64
	;   r12    base of the float array gathered through those indices
	;   r8d    inner-loop trip count (reloaded from numCoefsX$1$ after the loop)
	;   rsi    base of the multiplier array (reloaded from coefs$1$)
	;   r14    element offset into that array; reset to 0 when it equals rcx
	;   rcx    wrap limit for r14 (reloaded from tv3026)
	;   r13    output pointer between passes (also kept in tv3180)
	;   xmm5   initial accumulator value, presumably all zeros -- it is set
	;          outside this excerpt
	;   xmm6-xmm9    accumulators for output bytes 0-3, 4-7, 8-11, 12-15
	;   xmm11-xmm14  the 16 current indices
	;   tv3017       stack slot counting the outer passes still to run
	$LL4@resizeXmai:
	movaps xmm6, xmm5
	movaps xmm7, xmm5
	movaps xmm8, xmm5
	movaps xmm9, xmm5
	; Load the 16 starting indices for this pass.
	movdqu xmm11, XMMWORD PTR [rbp-32]
	movdqu xmm12, XMMWORD PTR [rbp-16]
	movdqu xmm13, XMMWORD PTR [rbp]
	movdqu xmm14, XMMWORD PTR [rbp+16]
	; Skip the inner loop entirely when the trip count is <= 0.
	test r8d, r8d
	jle $LN6@resizeXmai
	; xmm10 = index step; the constant's name encodes four dwords of 1.
	; r13 = rsi + 4*r14 + 32, so [r13-32]..[r13+31] covers 16 multipliers.
	; r14 += 16 * r8d is applied up front; r15 counts the inner passes.
	movdqa xmm10, XMMWORD PTR __xmm@00000001000000010000000100000001
	lea r13, QWORD PTR [r14+8]
	mov eax, r8d
	lea r13, QWORD PTR [rsi+r13*4]
	shl rax, 4
	add r14, rax
	mov r15d, r8d
	npad 1
	$LL7@resizeXmai:
	; Pull the index vectors into general registers, 64 bits (two indices) at a
	; time: xmm11 -> r8 (lanes 0,1), rdx (lanes 2,3); xmm12 -> r10, r9;
	; xmm13 -> rbx, r11; xmm14 -> rsi, rdi.  This overwrites r8 and rsi, which
	; is why they are reloaded after the loop.
	pextrq rdx, xmm11, 1
	movq r8, xmm11
	movq r10, xmm12
	pextrq r9, xmm12, 1
	pextrq r11, xmm13, 1
	movq rbx, xmm13
	; xmm11 lanes 3 and 2: sign-extend each index and load its float into
	; xmm4 / xmm0.  The xmm12 index step is interleaved here; its lanes were
	; already copied to r10/r9 above.
	mov rax, rdx
	shr rax, 32 ; 00000020H
	movq rsi, xmm14
	movsxd rcx, eax
	movdqa xmm1, xmm10
	movsxd rax, edx
	paddd xmm1, xmm12
	pextrq rdi, xmm14, 1
	movdqa xmm12, xmm1
	movss xmm4, DWORD PTR [r12+rcx*4]
	movss xmm0, DWORD PTR [r12+rax*4]
	; xmm11 lanes 1 and 0, then assemble xmm2 = gathered lanes 0..3, multiply by
	; the first four multipliers and add into xmm6.
	mov rax, r8
	shr rax, 32 ; 00000020H
	movsxd rcx, eax
	movsxd rax, r8d
	movss xmm2, DWORD PTR [r12+rax*4]
	mov rax, r9
	insertps xmm2, DWORD PTR [r12+rcx*4], 16
	insertps xmm2, xmm0, 32 ; 00000020H
	insertps xmm2, xmm4, 48 ; 00000030H; Line 375
	mulps xmm2, XMMWORD PTR [r13-32]
	; Same sequence for the original xmm12 indices (r9 = lanes 2,3; r10 =
	; lanes 0,1) -> accumulate into xmm7.
	shr rax, 32 ; 00000020H
	movsxd rcx, eax
	movsxd rax, r9d
	addps xmm6, xmm2
	movss xmm4, DWORD PTR [r12+rcx*4]
	movss xmm0, DWORD PTR [r12+rax*4]
	mov rax, r10
	shr rax, 32 ; 00000020H
	movsxd rcx, eax
	movsxd rax, r10d
	movss xmm2, DWORD PTR [r12+rax*4]
	mov rax, r11
	insertps xmm2, DWORD PTR [r12+rcx*4], 16
	insertps xmm2, xmm0, 32 ; 00000020H
	insertps xmm2, xmm4, 48 ; 00000030H
	mulps xmm2, XMMWORD PTR [r13-16]
	; Same sequence for the xmm13 indices (r11 = lanes 2,3; rbx = lanes 0,1)
	; -> accumulate into xmm8.
	shr rax, 32 ; 00000020H
	movsxd rcx, eax
	movsxd rax, r11d
	addps xmm7, xmm2
	movss xmm4, DWORD PTR [r12+rcx*4]
	movss xmm0, DWORD PTR [r12+rax*4]
	mov rax, rbx
	shr rax, 32 ; 00000020H
	movsxd rcx, eax
	movsxd rax, ebx
	movss xmm2, DWORD PTR [r12+rax*4]
	mov rax, rdi
	insertps xmm2, DWORD PTR [r12+rcx*4], 16
	insertps xmm2, xmm0, 32 ; 00000020H
	insertps xmm2, xmm4, 48 ; 00000030H
	mulps xmm2, XMMWORD PTR [r13]
	; Same sequence for the xmm14 indices (rdi = lanes 2,3; rsi = lanes 0,1)
	; -> accumulate into xmm9.  The index steps for xmm11, xmm13 and xmm14 are
	; interleaved with the tail of this gather.
	shr rax, 32 ; 00000020H
	movsxd rcx, eax
	movsxd rax, edi
	addps xmm8, xmm2
	movss xmm4, DWORD PTR [r12+rcx*4]
	movss xmm0, DWORD PTR [r12+rax*4]
	mov rax, rsi
	shr rax, 32 ; 00000020H
	movsxd rcx, eax
	movsxd rax, esi
	movss xmm2, DWORD PTR [r12+rax*4]
	insertps xmm2, DWORD PTR [r12+rcx*4], 16
	insertps xmm2, xmm0, 32 ; 00000020H
	movdqa xmm0, xmm10
	paddd xmm0, xmm11
	movdqa xmm11, xmm0
	movdqa xmm0, xmm10
	insertps xmm2, xmm4, 48 ; 00000030H
	mulps xmm2, XMMWORD PTR [r13+16]
	paddd xmm0, xmm13
	movdqa xmm13, xmm0
	addps xmm9, xmm2
	movdqa xmm1, xmm10
	; Next 64 bytes of multipliers; one fewer inner pass to go.
	add r13, 64 ; 00000040H
	paddd xmm1, xmm14
	movdqa xmm14, xmm1
	sub r15, 1
	jne $LL7@resizeXmai
	; Restore the registers the inner loop used as scratch.
	mov r13, QWORD PTR tv3180[rsp]
	mov rsi, QWORD PTR coefs$1$[rsp]
	mov r8d, DWORD PTR numCoefsX$1$[rsp]
	mov rcx, QWORD PTR tv3026[rsp]
	$LN6@resizeXmai:
	; rax = 0 is the wrap value that cmovne below leaves in place when
	; r14 == rcx.  rbp moves on to the next 16 indices.
	xor eax, eax
	add rbp, 64 ; 00000040H
	; roundps imm8 = 8 rounds to nearest with the precision exception
	; suppressed; cvtps2dq then yields int32, and packusdw/packuswb narrow with
	; unsigned saturation to 16 and then 8 bits.
	roundps xmm0, xmm7, 8
	cvtps2dq xmm2, xmm0
	roundps xmm1, xmm6, 8
	cvtps2dq xmm4, xmm1
	roundps xmm0, xmm9, 8
	packusdw xmm4, xmm2
	roundps xmm1, xmm8, 8
	cvtps2dq xmm2, xmm0
	cvtps2dq xmm3, xmm1
	packuswb xmm4, xmm4
	packusdw xmm3, xmm2
	packuswb xmm3, xmm3
	; Join the two 8-byte halves and store all 16 output bytes (unaligned store).
	punpcklqdq xmm4, xmm3
	movdqu XMMWORD PTR [r13], xmm4
	add r13, 16
	; r14 = (r14 != rcx) ? r14 : 0
	cmp r14, rcx
	mov QWORD PTR tv3180[rsp], r13
	cmovne rax, r14
	; One fewer outer pass remaining.  The jne below uses the flags from this
	; sub; the mov in between does not modify flags.
	sub QWORD PTR tv3017[rsp], 1
	mov r14, rax
	jne $LL4@resizeXmai