Last active
February 1, 2021 23:19
-
-
Save haampie/f6d0fc711d83d3d66506f666ee5c839b to your computer and use it in GitHub Desktop.
Tiny Transpose {5,6,7,8} x 8
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Straight-line AVX/AVX2 shuffle kernel (no label is present in this listing).
# Reads 160 bytes from [rsi] as five 32-byte vectors, rearranges their 32-bit
# lanes, and writes 160 bytes to [rdi]. rax holds a copy of rdi at ret.
# Presumably the 5-row member of the "Tiny Transpose {5,6,7,8} x 8" series --
# TODO confirm against whatever generated this dump.
# Alignment: vmovaps faults on misaligned memory operands, so rsi must be
# 16-byte aligned (xmm loads from [rsi] and [rsi + 80]) and rdi must be
# 32-byte aligned (ymm stores).
# Unaligned loads of the five input vectors.
vmovups ymm0, ymmword ptr [rsi] | |
vmovups ymm1, ymmword ptr [rsi + 32] | |
vmovups ymm2, ymmword ptr [rsi + 64] | |
vmovups ymm6, ymmword ptr [rsi + 96] | |
vmovups ymm7, ymmword ptr [rsi + 128] | |
# rcx -> 32-byte constant in .rodata.cst32; it is loaded into ymm10 below and
# serves as the index vector of the single vpermps (an AVX2 instruction).
movabs rcx, offset .rodata.cst32 | |
mov rax, rdi | |
vmovaps ymm10, ymmword ptr [rcx] | |
# Build vectors that straddle adjacent inputs (upper half of one, lower half
# of the next) so the in-lane shuffles below can reach across input vectors.
vperm2f128 ymm3, ymm0, ymm1, 33 # ymm3 = ymm0[2,3],ymm1[0,1] | |
vperm2f128 ymm5, ymm1, ymm2, 33 # ymm5 = ymm1[2,3],ymm2[0,1] | |
vperm2f128 ymm8, ymm6, ymm7, 33 # ymm8 = ymm6[2,3],ymm7[0,1] | |
vshufps ymm4, ymm3, ymm1, 41 # ymm4 = ymm3[1,2],ymm1[2,0],ymm3[5,6],ymm1[6,4] | |
vshufpd ymm1, ymm1, ymm5, 5 # ymm1 = ymm1[1],ymm5[0],ymm1[3],ymm5[2] | |
vshufps ymm5, ymm5, ymm2, 3 # ymm5 = ymm5[3,0],ymm2[0,0],ymm5[7,4],ymm2[4,4] | |
vshufps ymm9, ymm6, ymm8, 41 # ymm9 = ymm6[1,2],ymm8[2,0],ymm6[5,6],ymm8[6,4] | |
vshufpd ymm8, ymm8, ymm7, 5 # ymm8 = ymm8[1],ymm7[0],ymm8[3],ymm7[2] | |
# Cross-lane permute of the last input vector using the constant index vector.
vpermps ymm7, ymm10, ymm7 | |
vblendps ymm0, ymm0, ymm3, 170 # ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7] | |
# Aligned 16-byte reload of the first input bytes (requires rsi % 16 == 0).
vmovaps xmm3, xmmword ptr [rsi] | |
vshufps ymm5, ymm5, ymm2, 152 # ymm5 = ymm5[0,2],ymm2[1,2],ymm5[4,6],ymm2[5,6] | |
vperm2f128 ymm2, ymm2, ymm6, 33 # ymm2 = ymm2[2,3],ymm6[0,1] | |
vshufps xmm3, xmm3, xmm4, 221 # xmm3 = xmm3[1,3],xmm4[1,3] | |
vshufps ymm4, ymm1, ymm5, 136 # ymm4 = ymm1[0,2],ymm5[0,2],ymm1[4,6],ymm5[4,6] | |
vshufps xmm1, xmm1, xmm5, 221 # xmm1 = xmm1[1,3],xmm5[1,3] | |
# Aligned 16-byte load from the middle of the input (bytes 80..95).
vmovaps xmm5, xmmword ptr [rsi + 80] | |
vpermilps ymm4, ymm4, 216 # ymm4 = ymm4[0,2,1,3,4,6,5,7] | |
vshufps xmm5, xmm5, xmm9, 221 # xmm5 = xmm5[1,3],xmm9[1,3] | |
vblendps ymm2, ymm2, ymm6, 170 # ymm2 = ymm2[0],ymm6[1],ymm2[2],ymm6[3],ymm2[4],ymm6[5],ymm2[6],ymm6[7] | |
vshufps ymm6, ymm8, ymm7, 136 # ymm6 = ymm8[0,2],ymm7[0,2],ymm8[4,6],ymm7[4,6] | |
vshufps xmm7, xmm8, xmm7, 221 # xmm7 = xmm8[1,3],xmm7[1,3] | |
vunpcklpd ymm8, ymm0, ymm4 # ymm8 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] | |
vunpckhpd xmm0, xmm0, xmm4 # xmm0 = xmm0[1],xmm4[1] | |
vshufps xmm4, xmm3, xmm1, 136 # xmm4 = xmm3[0,2],xmm1[0,2] | |
vshufps xmm1, xmm3, xmm1, 221 # xmm1 = xmm3[1,3],xmm1[1,3] | |
vpermilps ymm6, ymm6, 216 # ymm6 = ymm6[0,2,1,3,4,6,5,7] | |
vunpcklpd ymm3, ymm2, ymm6 # ymm3 = ymm2[0],ymm6[0],ymm2[2],ymm6[2] | |
vunpckhpd xmm2, xmm2, xmm6 # xmm2 = xmm2[1],xmm6[1] | |
vshufps xmm6, xmm5, xmm7, 136 # xmm6 = xmm5[0,2],xmm7[0,2] | |
vshufps xmm5, xmm5, xmm7, 221 # xmm5 = xmm5[1,3],xmm7[1,3] | |
# Merge 128-bit halves into the first and last 32-byte output vectors.
vinsertf128 ymm7, ymm8, xmm3, 1 | |
vperm2f128 ymm3, ymm8, ymm3, 49 # ymm3 = ymm8[2,3],ymm3[2,3] | |
# Write-out: 160 bytes in total (ymm at +0, six xmm at +32..+112, ymm at +128).
vmovaps ymmword ptr [rdi], ymm7 | |
vmovaps xmmword ptr [rdi + 48], xmm6 | |
vmovaps xmmword ptr [rdi + 32], xmm4 | |
vmovaps xmmword ptr [rdi + 80], xmm2 | |
vmovaps xmmword ptr [rdi + 64], xmm0 | |
vmovaps xmmword ptr [rdi + 112], xmm5 | |
vmovaps xmmword ptr [rdi + 96], xmm1 | |
vmovaps ymmword ptr [rdi + 128], ymm3 | |
# vzeroupper zeroes the upper 128 bits of every ymm register before returning.
vzeroupper | |
ret | |
# Follows ret; presumably alignment padding emitted by the code generator.
nop word ptr cs:[rax + rax] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Straight-line AVX/AVX2 shuffle kernel (no label is present in this listing).
# Reads 192 bytes from [rsi] as six 32-byte vectors, rearranges their 32-bit
# lanes, and writes 192 bytes to [rdi]. rax holds a copy of rdi at ret.
# Presumably the 6-row member of the "Tiny Transpose" series -- TODO confirm.
# Alignment: every load is unaligned (vmovupd); the vmovaps stores require
# rdi to be 32-byte aligned. No constant-pool data is used.
vmovupd ymm0, ymmword ptr [rsi] | |
vmovupd ymm1, ymmword ptr [rsi + 32] | |
vmovupd ymm3, ymmword ptr [rsi + 64] | |
vmovupd ymm4, ymmword ptr [rsi + 96] | |
vmovupd ymm7, ymmword ptr [rsi + 160] | |
mov rax, rdi | |
# Form vectors straddling neighbouring inputs, then shift by one 64-bit
# element with vshufpd so even/odd 32-bit picks below line up.
vperm2f128 ymm2, ymm0, ymm1, 33 # ymm2 = ymm0[2,3],ymm1[0,1] | |
vperm2f128 ymm5, ymm3, ymm4, 33 # ymm5 = ymm3[2,3],ymm4[0,1] | |
vshufpd ymm2, ymm2, ymm1, 5 # ymm2 = ymm2[1],ymm1[0],ymm2[3],ymm1[2] | |
vperm2f128 ymm1, ymm1, ymm3, 33 # ymm1 = ymm1[2,3],ymm3[0,1] | |
vshufpd ymm3, ymm3, ymm5, 5 # ymm3 = ymm3[1],ymm5[0],ymm3[3],ymm5[2] | |
# Sixth input vector (bytes 128..159); ymm5 was only a temporary until here.
vmovupd ymm5, ymmword ptr [rsi + 128] | |
vshufps ymm8, ymm0, ymm2, 136 # ymm8 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] | |
vshufps ymm0, ymm0, ymm2, 221 # ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7] | |
vpermilps ymm8, ymm8, 216 # ymm8 = ymm8[0,2,1,3,4,6,5,7] | |
vpermilps ymm0, ymm0, 216 # ymm0 = ymm0[0,2,1,3,4,6,5,7] | |
vperm2f128 ymm6, ymm4, ymm5, 33 # ymm6 = ymm4[2,3],ymm5[0,1] | |
vshufps ymm2, ymm1, ymm3, 136 # ymm2 = ymm1[0,2],ymm3[0,2],ymm1[4,6],ymm3[4,6] | |
vshufps ymm1, ymm1, ymm3, 221 # ymm1 = ymm1[1,3],ymm3[1,3],ymm1[5,7],ymm3[5,7] | |
vpermilps ymm2, ymm2, 216 # ymm2 = ymm2[0,2,1,3,4,6,5,7] | |
vpermilps ymm1, ymm1, 216 # ymm1 = ymm1[0,2,1,3,4,6,5,7] | |
vshufpd ymm6, ymm6, ymm5, 5 # ymm6 = ymm6[1],ymm5[0],ymm6[3],ymm5[2] | |
vperm2f128 ymm5, ymm5, ymm7, 33 # ymm5 = ymm5[2,3],ymm7[0,1] | |
# vpermpd with an immediate is an AVX2 instruction: rotates the four 64-bit
# elements of the last input vector by one position.
vpermpd ymm7, ymm7, 57 # ymm7 = ymm7[1,2,3,0] | |
vshufps ymm3, ymm4, ymm6, 136 # ymm3 = ymm4[0,2],ymm6[0,2],ymm4[4,6],ymm6[4,6] | |
vshufps ymm4, ymm4, ymm6, 221 # ymm4 = ymm4[1,3],ymm6[1,3],ymm4[5,7],ymm6[5,7] | |
vpermilps ymm3, ymm3, 216 # ymm3 = ymm3[0,2,1,3,4,6,5,7] | |
vpermilps ymm4, ymm4, 216 # ymm4 = ymm4[0,2,1,3,4,6,5,7] | |
vshufps ymm6, ymm5, ymm7, 136 # ymm6 = ymm5[0,2],ymm7[0,2],ymm5[4,6],ymm7[4,6] | |
vshufps ymm5, ymm5, ymm7, 221 # ymm5 = ymm5[1,3],ymm7[1,3],ymm5[5,7],ymm7[5,7] | |
# Interleave 64-bit pairs from the intermediate vectors.
vunpcklpd ymm7, ymm8, ymm2 # ymm7 = ymm8[0],ymm2[0],ymm8[2],ymm2[2] | |
vunpckhpd xmm2, xmm8, xmm2 # xmm2 = xmm8[1],xmm2[1] | |
vunpcklpd ymm8, ymm0, ymm1 # ymm8 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] | |
vunpckhpd xmm0, xmm0, xmm1 # xmm0 = xmm0[1],xmm1[1] | |
vpermilps ymm6, ymm6, 216 # ymm6 = ymm6[0,2,1,3,4,6,5,7] | |
vpermilps ymm5, ymm5, 216 # ymm5 = ymm5[0,2,1,3,4,6,5,7] | |
vunpcklpd ymm1, ymm3, ymm6 # ymm1 = ymm3[0],ymm6[0],ymm3[2],ymm6[2] | |
vunpckhpd xmm3, xmm3, xmm6 # xmm3 = xmm3[1],xmm6[1] | |
vunpcklpd ymm6, ymm4, ymm5 # ymm6 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] | |
vunpckhpd xmm4, xmm4, xmm5 # xmm4 = xmm4[1],xmm5[1] | |
# Merge 128-bit halves into full 32-byte output vectors.
vinsertf128 ymm5, ymm7, xmm1, 1 | |
vperm2f128 ymm1, ymm7, ymm1, 49 # ymm1 = ymm7[2,3],ymm1[2,3] | |
vinsertf128 ymm7, ymm8, xmm6, 1 | |
vperm2f128 ymm6, ymm8, ymm6, 49 # ymm6 = ymm8[2,3],ymm6[2,3] | |
# Write-out: 192 bytes (ymm at +0/+32, four xmm at +64..+112, ymm at +128/+160).
vmovaps ymmword ptr [rdi], ymm5 | |
vmovaps ymmword ptr [rdi + 32], ymm7 | |
vmovaps xmmword ptr [rdi + 80], xmm3 | |
vmovaps xmmword ptr [rdi + 64], xmm2 | |
vmovaps xmmword ptr [rdi + 112], xmm4 | |
vmovaps xmmword ptr [rdi + 96], xmm0 | |
vmovaps ymmword ptr [rdi + 128], ymm1 | |
vmovaps ymmword ptr [rdi + 160], ymm6 | |
# vzeroupper zeroes the upper 128 bits of every ymm register before returning.
vzeroupper | |
ret | |
# Follows ret; presumably alignment padding emitted by the code generator.
nop word ptr [rax + rax] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Straight-line AVX/AVX2 shuffle kernel (no label is present in this listing).
# Reads 224 bytes from [rsi] as seven 32-byte vectors, rearranges their 32-bit
# lanes, and writes 224 bytes to [rdi]. rax holds a copy of rdi at ret.
# Presumably the 7-row member of the "Tiny Transpose" series -- TODO confirm.
# Alignment: every input load is unaligned (vmovupd); the vmovaps stores
# require rdi to be 32-byte aligned.
vmovupd ymm1, ymmword ptr [rsi + 32] | |
vmovupd ymm3, ymmword ptr [rsi + 64] | |
vmovupd ymm8, ymmword ptr [rsi + 160] | |
vmovupd ymm10, ymmword ptr [rsi + 192] | |
vmovupd ymm5, ymmword ptr [rsi + 96] | |
vmovupd ymm7, ymmword ptr [rsi + 128] | |
vmovupd ymm0, ymmword ptr [rsi] | |
# rcx -> 32-byte constant in .rodata.cst32; loaded into ymm11 further down and
# used as the index vector of the single vpermps (an AVX2 instruction).
movabs rcx, offset .rodata.cst32 | |
mov rax, rdi | |
# Form vectors straddling neighbouring inputs (upper half of one, lower half
# of the next).
vperm2f128 ymm4, ymm1, ymm3, 33 # ymm4 = ymm1[2,3],ymm3[0,1] | |
vperm2f128 ymm11, ymm8, ymm10, 33 # ymm11 = ymm8[2,3],ymm10[0,1] | |
vperm2f128 ymm9, ymm7, ymm8, 33 # ymm9 = ymm7[2,3],ymm8[0,1] | |
vperm2f128 ymm2, ymm0, ymm1, 33 # ymm2 = ymm0[2,3],ymm1[0,1] | |
vshufpd ymm4, ymm4, ymm3, 5 # ymm4 = ymm4[1],ymm3[0],ymm4[3],ymm3[2] | |
vperm2f128 ymm3, ymm3, ymm5, 33 # ymm3 = ymm3[2,3],ymm5[0,1] | |
vshufpd ymm8, ymm8, ymm11, 5 # ymm8 = ymm8[1],ymm11[0],ymm8[3],ymm11[2] | |
# ymm11 is free again after the vshufpd above; reuse it for the index constant.
vmovapd ymm11, ymmword ptr [rcx] | |
vshufps ymm2, ymm2, ymm1, 3 # ymm2 = ymm2[3,0],ymm1[0,0],ymm2[7,4],ymm1[4,4] | |
vshufps ymm2, ymm2, ymm1, 152 # ymm2 = ymm2[0,2],ymm1[1,2],ymm2[4,6],ymm1[5,6] | |
vshufps ymm2, ymm0, ymm2, 136 # ymm2 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] | |
vshufps ymm0, ymm0, ymm1, 141 # ymm0 = ymm0[1,3],ymm1[0,2],ymm0[5,7],ymm1[4,6] | |
vpermilps ymm0, ymm0, 216 # ymm0 = ymm0[0,2,1,3,4,6,5,7] | |
# Cross-lane permute of the last input vector (ymm10) by the constant indices.
vpermps ymm11, ymm11, ymm10 | |
vshufps ymm6, ymm3, ymm5, 41 # ymm6 = ymm3[1,2],ymm5[2,0],ymm3[5,6],ymm5[6,4] | |
vperm2f128 ymm5, ymm5, ymm7, 33 # ymm5 = ymm5[2,3],ymm7[0,1] | |
vshufps ymm7, ymm7, ymm9, 3 # ymm7 = ymm7[3,0],ymm9[0,0],ymm7[7,4],ymm9[4,4] | |
vshufps ymm1, ymm4, ymm3, 216 # ymm1 = ymm4[0,2],ymm3[1,3],ymm4[4,6],ymm3[5,7] | |
vshufps ymm7, ymm7, ymm9, 152 # ymm7 = ymm7[0,2],ymm9[1,2],ymm7[4,6],ymm9[5,6] | |
vshufps ymm3, ymm4, ymm6, 221 # ymm3 = ymm4[1,3],ymm6[1,3],ymm4[5,7],ymm6[5,7] | |
vshufps ymm6, ymm8, ymm10, 216 # ymm6 = ymm8[0,2],ymm10[1,3],ymm8[4,6],ymm10[5,7] | |
vpermilps ymm3, ymm3, 216 # ymm3 = ymm3[0,2,1,3,4,6,5,7] | |
vshufps ymm4, ymm5, ymm7, 136 # ymm4 = ymm5[0,2],ymm7[0,2],ymm5[4,6],ymm7[4,6] | |
vshufps ymm7, ymm8, ymm11, 221 # ymm7 = ymm8[1,3],ymm11[1,3],ymm8[5,7],ymm11[5,7] | |
vshufps ymm5, ymm5, ymm9, 141 # ymm5 = ymm5[1,3],ymm9[0,2],ymm5[5,7],ymm9[4,6] | |
vshufps ymm8, ymm2, ymm1, 136 # ymm8 = ymm2[0,2],ymm1[0,2],ymm2[4,6],ymm1[4,6] | |
vshufps ymm1, ymm2, ymm1, 221 # ymm1 = ymm2[1,3],ymm1[1,3],ymm2[5,7],ymm1[5,7] | |
vunpcklpd ymm2, ymm0, ymm3 # ymm2 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] | |
vunpckhpd xmm0, xmm0, xmm3 # xmm0 = xmm0[1],xmm3[1] | |
vpermilps ymm5, ymm5, 216 # ymm5 = ymm5[0,2,1,3,4,6,5,7] | |
vpermilps ymm7, ymm7, 216 # ymm7 = ymm7[0,2,1,3,4,6,5,7] | |
vshufps ymm3, ymm4, ymm6, 136 # ymm3 = ymm4[0,2],ymm6[0,2],ymm4[4,6],ymm6[4,6] | |
vshufps ymm4, ymm4, ymm6, 221 # ymm4 = ymm4[1,3],ymm6[1,3],ymm4[5,7],ymm6[5,7] | |
vunpcklpd ymm6, ymm5, ymm7 # ymm6 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] | |
vunpckhpd xmm5, xmm5, xmm7 # xmm5 = xmm5[1],xmm7[1] | |
# Merge 128-bit halves into full 32-byte output vectors.
vinsertf128 ymm7, ymm8, xmm3, 1 | |
vperm2f128 ymm3, ymm8, ymm3, 49 # ymm3 = ymm8[2,3],ymm3[2,3] | |
vinsertf128 ymm8, ymm2, xmm6, 1 | |
vperm2f128 ymm2, ymm2, ymm6, 49 # ymm2 = ymm2[2,3],ymm6[2,3] | |
vinsertf128 ymm6, ymm1, xmm4, 1 | |
vperm2f128 ymm1, ymm1, ymm4, 49 # ymm1 = ymm1[2,3],ymm4[2,3] | |
# Write-out: 224 bytes (ymm at +0/+32/+64, xmm at +96/+112, ymm at +128/+160/+192).
vmovaps ymmword ptr [rdi], ymm7 | |
vmovaps ymmword ptr [rdi + 32], ymm8 | |
vmovaps ymmword ptr [rdi + 64], ymm6 | |
vmovaps xmmword ptr [rdi + 112], xmm5 | |
vmovaps xmmword ptr [rdi + 96], xmm0 | |
vmovaps ymmword ptr [rdi + 128], ymm3 | |
vmovaps ymmword ptr [rdi + 160], ymm2 | |
vmovaps ymmword ptr [rdi + 192], ymm1 | |
# vzeroupper zeroes the upper 128 bits of every ymm register before returning.
vzeroupper | |
ret | |
# Follows ret; presumably alignment padding emitted by the code generator.
nop word ptr [rax + rax] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Straight-line AVX shuffle kernel (no label is present in this listing).
# Reads 256 bytes from [rsi] as eight 32-byte vectors, rearranges their 32-bit
# lanes, and writes 256 bytes to [rdi]. rax holds a copy of rdi at ret.
# Presumably the 8-row member of the "Tiny Transpose" series -- TODO confirm.
# Only vshufps, vinsertf128 and vperm2f128 are used; no constant-pool data.
# Alignment: every load is unaligned (vmovups); the vmovaps stores require
# rdi to be 32-byte aligned.
vmovups ymm0, ymmword ptr [rsi] | |
vmovups ymm1, ymmword ptr [rsi + 32] | |
vmovups ymm2, ymmword ptr [rsi + 64] | |
vmovups ymm3, ymmword ptr [rsi + 96] | |
vmovups ymm5, ymmword ptr [rsi + 160] | |
vmovups ymm7, ymmword ptr [rsi + 224] | |
mov rax, rdi | |
# Stage 1: for each pair of adjacent input vectors, split into the
# even-indexed (imm 136) and odd-indexed (imm 221) 32-bit elements per lane.
vshufps ymm4, ymm0, ymm1, 136 # ymm4 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] | |
vshufps ymm0, ymm0, ymm1, 221 # ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] | |
vshufps ymm1, ymm2, ymm3, 136 # ymm1 = ymm2[0,2],ymm3[0,2],ymm2[4,6],ymm3[4,6] | |
vshufps ymm2, ymm2, ymm3, 221 # ymm2 = ymm2[1,3],ymm3[1,3],ymm2[5,7],ymm3[5,7] | |
# Remaining inputs are loaded late, reusing ymm3/ymm5 once their old values
# have been consumed.
vmovups ymm3, ymmword ptr [rsi + 128] | |
vshufps ymm6, ymm3, ymm5, 136 # ymm6 = ymm3[0,2],ymm5[0,2],ymm3[4,6],ymm5[4,6] | |
vshufps ymm3, ymm3, ymm5, 221 # ymm3 = ymm3[1,3],ymm5[1,3],ymm3[5,7],ymm5[5,7] | |
vmovups ymm5, ymmword ptr [rsi + 192] | |
vshufps ymm8, ymm5, ymm7, 136 # ymm8 = ymm5[0,2],ymm7[0,2],ymm5[4,6],ymm7[4,6] | |
vshufps ymm5, ymm5, ymm7, 221 # ymm5 = ymm5[1,3],ymm7[1,3],ymm5[5,7],ymm7[5,7] | |
# Stage 2: repeat the even/odd split on the stage-1 results.
vshufps ymm7, ymm4, ymm1, 136 # ymm7 = ymm4[0,2],ymm1[0,2],ymm4[4,6],ymm1[4,6] | |
vshufps ymm1, ymm4, ymm1, 221 # ymm1 = ymm4[1,3],ymm1[1,3],ymm4[5,7],ymm1[5,7] | |
vshufps ymm4, ymm0, ymm2, 136 # ymm4 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] | |
vshufps ymm0, ymm0, ymm2, 221 # ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7] | |
vshufps ymm2, ymm6, ymm8, 136 # ymm2 = ymm6[0,2],ymm8[0,2],ymm6[4,6],ymm8[4,6] | |
vshufps ymm6, ymm6, ymm8, 221 # ymm6 = ymm6[1,3],ymm8[1,3],ymm6[5,7],ymm8[5,7] | |
vshufps ymm8, ymm3, ymm5, 136 # ymm8 = ymm3[0,2],ymm5[0,2],ymm3[4,6],ymm5[4,6] | |
vshufps ymm3, ymm3, ymm5, 221 # ymm3 = ymm3[1,3],ymm5[1,3],ymm3[5,7],ymm5[5,7] | |
# Stage 3: pair up 128-bit halves -- vinsertf128 joins the two low halves,
# vperm2f128 (imm 49) joins the two high halves.
vinsertf128 ymm5, ymm7, xmm2, 1 | |
vperm2f128 ymm2, ymm7, ymm2, 49 # ymm2 = ymm7[2,3],ymm2[2,3] | |
vinsertf128 ymm7, ymm4, xmm8, 1 | |
vperm2f128 ymm4, ymm4, ymm8, 49 # ymm4 = ymm4[2,3],ymm8[2,3] | |
vinsertf128 ymm8, ymm1, xmm6, 1 | |
vperm2f128 ymm1, ymm1, ymm6, 49 # ymm1 = ymm1[2,3],ymm6[2,3] | |
vinsertf128 ymm6, ymm0, xmm3, 1 | |
vperm2f128 ymm0, ymm0, ymm3, 49 # ymm0 = ymm0[2,3],ymm3[2,3] | |
# Write-out: eight 32-byte vectors, 256 bytes in total.
vmovaps ymmword ptr [rdi], ymm5 | |
vmovaps ymmword ptr [rdi + 32], ymm7 | |
vmovaps ymmword ptr [rdi + 64], ymm8 | |
vmovaps ymmword ptr [rdi + 96], ymm6 | |
vmovaps ymmword ptr [rdi + 128], ymm2 | |
vmovaps ymmword ptr [rdi + 160], ymm4 | |
vmovaps ymmword ptr [rdi + 192], ymm1 | |
vmovaps ymmword ptr [rdi + 224], ymm0 | |
# vzeroupper zeroes the upper 128 bits of every ymm register before returning.
vzeroupper | |
ret | |
# Follows ret; presumably alignment padding emitted by the code generator.
nop |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Straight-line AVX/AVX2 shuffle kernel (no label is present in this listing).
# Reads 160 bytes from [rsi], rearranges 32-bit lanes with blends and
# index-driven vpermps permutes, and writes five 32-byte vectors (160 bytes)
# to [rdi]. rax holds a copy of rdi at ret.
# Presumably a 5-row "Tiny Transpose" variant -- TODO confirm which one.
# Alignment: input loads are unaligned; the vmovaps stores require rdi to be
# 32-byte aligned, and the vmovaps constant loads require 16-byte aligned
# constant addresses.
# NOTE(review): the bare numeric movabs immediates (e.g. 140200282622864) are
# used as memory addresses of permutation constants. They look like absolute
# addresses fixed by the process that produced this dump (presumably a JIT
# constant pool), so the listing is not relocatable as shown -- confirm.
vmovups ymm5, ymmword ptr [rsi + 96] | |
movabs rcx, offset .rodata.cst16 | |
vmovups ymm1, ymmword ptr [rsi] | |
vmovups ymm2, ymmword ptr [rsi + 32] | |
vmovups ymm4, ymmword ptr [rsi + 64] | |
vmovups ymm0, ymmword ptr [rsi + 128] | |
# rdx -> 8-byte constant, broadcast into ymm2 much later as a vpermps index.
movabs rdx, 140200282622864 | |
mov rax, rdi | |
# 16-byte index constant; the VEX-encoded xmm load zeroes the upper half of ymm6.
vmovaps xmm6, xmmword ptr [rcx] | |
movabs rcx, 140200282622784 | |
vpermpd ymm3, ymm5, 196 # ymm3 = ymm5[0,1,0,3] | |
vblendps ymm7, ymm1, ymm2, 204 # ymm7 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] | |
vblendps ymm8, ymm4, ymm5, 204 # ymm8 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7] | |
vblendps ymm9, ymm1, ymm2, 48 # ymm9 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] | |
# vpermps (AVX2): first source operand is the index vector, second is the data.
vpermps ymm6, ymm6, ymm7 | |
vblendps ymm7, ymm1, ymm2, 12 # ymm7 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] | |
vblendps ymm3, ymm3, ymm4, 16 # ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6,7] | |
vblendps ymm3, ymm6, ymm3, 240 # ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] | |
# Single 32-bit element at byte offset 140 supplies lane 7 of the first output.
vbroadcastss ymm6, dword ptr [rsi + 140] | |
vblendps ymm3, ymm3, ymm6, 128 # ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7] | |
vmovaps xmm6, xmmword ptr [rcx] | |
movabs rcx, 140200282622800 | |
vpermps ymm6, ymm6, ymm7 | |
vbroadcastf128 ymm7, xmmword ptr [rcx] # ymm7 = mem[0,1,0,1] | |
movabs rcx, 140200282622816 | |
vpermps ymm7, ymm7, ymm8 | |
vmovaps xmm8, xmmword ptr [rcx] | |
movabs rcx, offset .rodata.cst8 | |
vpermps ymm8, ymm8, ymm9 | |
vshufps ymm9, ymm1, ymm2, 3 # ymm9 = ymm1[3,0],ymm2[0,0],ymm1[7,4],ymm2[4,4] | |
vblendps ymm1, ymm2, ymm1, 48 # ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] | |
vshufps ymm9, ymm9, ymm2, 216 # ymm9 = ymm9[0,2],ymm2[1,3],ymm9[4,6],ymm2[5,7] | |
# ymm2's input data is dead from here on; it now holds a broadcast index pair.
vbroadcastsd ymm2, qword ptr [rdx] | |
vpermpd ymm9, ymm9, 236 # ymm9 = ymm9[0,3,2,3] | |
vblendps ymm6, ymm7, ymm6, 7 # ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7] | |
# Single 32-bit element at byte offset 144 supplies lane 7 of the second output.
vbroadcastss ymm7, dword ptr [rsi + 144] | |
vblendps ymm6, ymm6, ymm7, 128 # ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] | |
vinsertf128 ymm7, ymm0, xmmword ptr [rsi + 96], 1 | |
# Outputs are stored as soon as each one is complete.
vmovaps ymmword ptr [rdi], ymm3 | |
vmovaps ymmword ptr [rdi + 32], ymm6 | |
vshufps ymm7, ymm7, ymm4, 35 # ymm7 = ymm7[3,0],ymm4[2,0],ymm7[7,4],ymm4[6,4] | |
vshufps ymm7, ymm7, ymm4, 98 # ymm7 = ymm7[2,0],ymm4[2,1],ymm7[6,4],ymm4[6,5] | |
vblendps ymm7, ymm8, ymm7, 56 # ymm7 = ymm8[0,1,2],ymm7[3,4,5],ymm8[6,7] | |
vbroadcastsd ymm8, qword ptr [rcx] | |
movabs rcx, 140200282622856 | |
vpermps ymm8, ymm8, ymm0 | |
vblendps ymm7, ymm7, ymm8, 192 # ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] | |
vshufps ymm8, ymm5, ymm4, 48 # ymm8 = ymm5[0,0],ymm4[3,0],ymm5[4,4],ymm4[7,4] | |
vshufps ymm8, ymm8, ymm4, 162 # ymm8 = ymm8[2,0],ymm4[2,2],ymm8[6,4],ymm4[6,6] | |
vperm2f128 ymm4, ymm4, ymm5, 32 # ymm4 = ymm4[0,1],ymm5[0,1] | |
vmovaps ymmword ptr [rdi + 64], ymm7 | |
vblendps ymm8, ymm9, ymm8, 56 # ymm8 = ymm9[0,1,2],ymm8[3,4,5],ymm9[6,7] | |
vbroadcastsd ymm9, qword ptr [rcx] | |
movabs rcx, 140200282622832 | |
vpermps ymm9, ymm9, ymm0 | |
vpermps ymm0, ymm2, ymm0 | |
vblendps ymm4, ymm4, ymm5, 34 # ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] | |
vmovaps xmm5, xmmword ptr [rcx] | |
vpermps ymm1, ymm5, ymm1 | |
vblendps ymm8, ymm8, ymm9, 192 # ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] | |
vmovaps ymmword ptr [rdi + 96], ymm8 | |
vblendps ymm1, ymm1, ymm4, 56 # ymm1 = ymm1[0,1,2],ymm4[3,4,5],ymm1[6,7] | |
vblendps ymm0, ymm1, ymm0, 192 # ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] | |
vmovaps ymmword ptr [rdi + 128], ymm0 | |
# vzeroupper zeroes the upper 128 bits of every ymm register before returning.
vzeroupper | |
ret | |
# Follows ret; presumably alignment padding emitted by the code generator.
nop dword ptr [rax + rax] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Straight-line AVX/AVX2 shuffle kernel (no label is present in this listing).
# Reads 192 bytes from [rsi] as six 32-byte vectors, rearranges their 32-bit
# lanes with shuffles, blends and index-driven vpermps permutes, and writes six
# 32-byte vectors (192 bytes) to [rdi]. rax holds a copy of rdi at ret.
# Presumably a 6-row "Tiny Transpose" variant -- TODO confirm which one.
# Alignment: the vmovaps xmm load from [rsi + 80] requires rsi to be 16-byte
# aligned; the vmovaps ymm stores require rdi to be 32-byte aligned.
# NOTE(review): the bare numeric movabs immediates are used as addresses of
# permutation constants; they look like absolute addresses fixed by the process
# that produced this dump (presumably a JIT constant pool) -- confirm.
vmovups ymm5, ymmword ptr [rsi + 96] | |
vmovups ymm6, ymmword ptr [rsi + 64] | |
movabs rcx, offset .rodata.cst16 | |
vmovups ymm2, ymmword ptr [rsi] | |
vmovups ymm4, ymmword ptr [rsi + 32] | |
vmovups ymm0, ymmword ptr [rsi + 128] | |
vmovups ymm1, ymmword ptr [rsi + 160] | |
mov rax, rdi | |
# Index constants: xmm8 (16 bytes) and ymm14 (8 bytes broadcast to all lanes).
vmovaps xmm8, xmmword ptr [rcx] | |
movabs rcx, offset .rodata.cst8 | |
vbroadcastsd ymm14, qword ptr [rcx] | |
movabs rcx, 140200282654384 | |
vperm2f128 ymm7, ymm6, ymm5, 32 # ymm7 = ymm6[0,1],ymm5[0,1] | |
vblendps ymm9, ymm2, ymm4, 48 # ymm9 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7] | |
vblendps ymm10, ymm1, ymm0, 48 # ymm10 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] | |
vshufps ymm12, ymm2, ymm4, 19 # ymm12 = ymm2[3,0],ymm4[1,0],ymm2[7,4],ymm4[5,4] | |
# vpermps (AVX2): first source operand is the index vector, second is the data.
vpermps ymm8, ymm8, ymm9 | |
vshufps ymm12, ymm12, ymm4, 248 # ymm12 = ymm12[0,2],ymm4[3,3],ymm12[4,6],ymm4[7,7] | |
vpermps ymm11, ymm14, ymm10 | |
vpermpd ymm12, ymm12, 236 # ymm12 = ymm12[0,3,2,3] | |
# Assemble output vector 0 in ymm3 (stored to [rdi] further down).
vshufps ymm3, ymm5, ymm7, 2 # ymm3 = ymm5[2,0],ymm7[0,0],ymm5[6,4],ymm7[4,4] | |
vshufps ymm3, ymm3, ymm7, 162 # ymm3 = ymm3[2,0],ymm7[2,2],ymm3[6,4],ymm7[6,6] | |
vblendps ymm3, ymm8, ymm3, 56 # ymm3 = ymm8[0,1,2],ymm3[3,4,5],ymm8[6,7] | |
vblendps ymm3, ymm3, ymm11, 192 # ymm3 = ymm3[0,1,2,3,4,5],ymm11[6,7] | |
# Assemble output vector 1 in ymm7.
vshufps ymm11, ymm5, ymm7, 19 # ymm11 = ymm5[3,0],ymm7[1,0],ymm5[7,4],ymm7[5,4] | |
vshufps ymm7, ymm11, ymm7, 226 # ymm7 = ymm11[2,0],ymm7[2,3],ymm11[6,4],ymm7[6,7] | |
vmovaps xmm11, xmmword ptr [rcx] | |
movabs rcx, 140200282654440 | |
vbroadcastsd ymm13, qword ptr [rcx] | |
movabs rcx, 140200282654400 | |
vpermps ymm9, ymm11, ymm9 | |
vshufps ymm11, ymm2, ymm4, 2 # ymm11 = ymm2[2,0],ymm4[0,0],ymm2[6,4],ymm4[4,4] | |
vblendps ymm2, ymm4, ymm2, 48 # ymm2 = ymm4[0,1,2,3],ymm2[4,5],ymm4[6,7] | |
vpermps ymm10, ymm13, ymm10 | |
vshufps ymm11, ymm11, ymm4, 232 # ymm11 = ymm11[0,2],ymm4[2,3],ymm11[4,6],ymm4[6,7] | |
vpermps ymm4, ymm14, ymm2 | |
vpermps ymm2, ymm13, ymm2 | |
vpermpd ymm11, ymm11, 236 # ymm11 = ymm11[0,3,2,3] | |
vblendps ymm7, ymm9, ymm7, 56 # ymm7 = ymm9[0,1,2],ymm7[3,4,5],ymm9[6,7] | |
vblendps ymm7, ymm7, ymm10, 192 # ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7] | |
# Assemble output vector 2 in ymm10.
vshufps ymm10, ymm6, ymm5, 32 # ymm10 = ymm6[0,0],ymm5[2,0],ymm6[4,4],ymm5[6,4] | |
vshufps ymm10, ymm10, ymm5, 226 # ymm10 = ymm10[2,0],ymm5[2,3],ymm10[6,4],ymm5[6,7] | |
vpermpd ymm10, ymm10, 200 # ymm10 = ymm10[0,2,0,3] | |
vblendps ymm10, ymm11, ymm10, 24 # ymm10 = ymm11[0,1,2],ymm10[3,4],ymm11[5,6,7] | |
vshufps ymm11, ymm1, ymm0, 32 # ymm11 = ymm1[0,0],ymm0[2,0],ymm1[4,4],ymm0[6,4] | |
vshufps ymm11, ymm0, ymm11, 32 # ymm11 = ymm0[0,0],ymm11[2,0],ymm0[4,4],ymm11[6,4] | |
vpermpd ymm11, ymm11, 196 # ymm11 = ymm11[0,1,0,3] | |
vblendps ymm10, ymm10, ymm11, 224 # ymm10 = ymm10[0,1,2,3,4],ymm11[5,6,7] | |
# Assemble output vector 3 in ymm11 (completed after the first three stores).
vshufps ymm11, ymm6, ymm5, 49 # ymm11 = ymm6[1,0],ymm5[3,0],ymm6[5,4],ymm5[7,4] | |
vshufps ymm11, ymm11, ymm5, 226 # ymm11 = ymm11[2,0],ymm5[2,3],ymm11[6,4],ymm5[6,7] | |
vblendps ymm5, ymm6, ymm5, 240 # ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] | |
# Aligned 16-byte load of input bytes 80..95 (requires rsi % 16 == 0).
vmovaps xmm6, xmmword ptr [rsi + 80] | |
vmovaps ymmword ptr [rdi], ymm3 | |
vmovaps ymmword ptr [rdi + 32], ymm7 | |
vmovaps ymmword ptr [rdi + 64], ymm10 | |
vpermpd ymm11, ymm11, 200 # ymm11 = ymm11[0,2,0,3] | |
vblendps ymm11, ymm12, ymm11, 24 # ymm11 = ymm12[0,1,2],ymm11[3,4],ymm12[5,6,7] | |
vshufps ymm12, ymm1, ymm0, 49 # ymm12 = ymm1[1,0],ymm0[3,0],ymm1[5,4],ymm0[7,4] | |
vshufps ymm12, ymm0, ymm12, 36 # ymm12 = ymm0[0,1],ymm12[2,0],ymm0[4,5],ymm12[6,4] | |
vblendps ymm0, ymm1, ymm0, 12 # ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] | |
vbroadcastf128 ymm1, xmmword ptr [rcx] # ymm1 = mem[0,1,0,1] | |
movabs rcx, 140200282654416 | |
vpermpd ymm12, ymm12, 196 # ymm12 = ymm12[0,1,0,3] | |
vpermps ymm1, ymm1, ymm0 | |
vblendps ymm11, ymm11, ymm12, 224 # ymm11 = ymm11[0,1,2,3,4],ymm12[5,6,7] | |
# Assemble output vector 4 in ymm1.
vshufps ymm12, ymm6, ymm5, 2 # ymm12 = ymm6[2,0],ymm5[0,0],ymm6[6,4],ymm5[4,4] | |
vshufps ymm12, ymm5, ymm12, 36 # ymm12 = ymm5[0,1],ymm12[2,0],ymm5[4,5],ymm12[6,4] | |
vmovaps ymmword ptr [rdi + 96], ymm11 | |
vblendps ymm4, ymm12, ymm4, 3 # ymm4 = ymm4[0,1],ymm12[2,3,4,5,6,7] | |
vblendps ymm1, ymm4, ymm1, 224 # ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] | |
# Assemble output vector 5 in ymm0.
vshufps ymm4, ymm6, ymm5, 19 # ymm4 = ymm6[3,0],ymm5[1,0],ymm6[7,4],ymm5[5,4] | |
vshufps ymm4, ymm5, ymm4, 37 # ymm4 = ymm5[1,1],ymm4[2,0],ymm5[5,5],ymm4[6,4] | |
vbroadcastf128 ymm5, xmmword ptr [rcx] # ymm5 = mem[0,1,0,1] | |
vmovaps ymmword ptr [rdi + 128], ymm1 | |
vblendps ymm2, ymm4, ymm2, 3 # ymm2 = ymm2[0,1],ymm4[2,3,4,5,6,7] | |
vpermps ymm0, ymm5, ymm0 | |
vblendps ymm0, ymm2, ymm0, 224 # ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] | |
vmovaps ymmword ptr [rdi + 160], ymm0 | |
# vzeroupper zeroes the upper 128 bits of every ymm register before returning.
vzeroupper | |
ret | |
# Follows ret; presumably alignment padding emitted by the code generator.
nop dword ptr [rax] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Straight-line AVX/AVX2 shuffle kernel (no label is present in this listing).
# Reads 224 bytes from [rsi] (offsets 0..223) and writes eight groups of
# 28 bytes to [rdi] at a 32-byte stride: each group is one 16-byte store, one
# 8-byte store and one 4-byte store. Bytes 28..31 of every 32-byte slot are
# never written. rax holds a copy of rdi at ret.
# Presumably a 7-row "Tiny Transpose" variant producing seven 32-bit values per
# output row -- TODO confirm.
# Alignment: the vmovaps xmm loads require rsi to be 16-byte aligned and the
# vmovaps xmm stores require rdi to be 16-byte aligned. No constants are used.
# Low 16 bytes of each 32-byte input chunk, plus the two halves of the last one.
vmovaps xmm7, xmmword ptr [rsi + 160] | |
vmovaps xmm0, xmmword ptr [rsi + 128] | |
vmovaps xmm4, xmmword ptr [rsi + 96] | |
vmovaps xmm6, xmmword ptr [rsi] | |
vmovaps xmm1, xmmword ptr [rsi + 32] | |
vmovaps xmm3, xmmword ptr [rsi + 64] | |
vmovaps xmm2, xmmword ptr [rsi + 192] | |
# The same first six chunks again as full 32-byte vectors; their upper halves
# feed output groups 4..7 below.
vmovups ymm11, ymmword ptr [rsi + 160] | |
vmovups ymm15, ymmword ptr [rsi + 128] | |
vmovups ymm8, ymmword ptr [rsi + 96] | |
vmovups ymm13, ymmword ptr [rsi + 64] | |
vmovups ymm9, ymmword ptr [rsi + 32] | |
vmovups ymm14, ymmword ptr [rsi] | |
vmovaps xmm12, xmmword ptr [rsi + 208] | |
mov rax, rdi | |
# Output group 0 (rdi + 0..27): element 0 of each low-half chunk.
vinsertps xmm10, xmm0, xmm7, 28 # xmm10 = xmm0[0],xmm7[0],zero,zero | |
vinsertps xmm5, xmm6, xmm1, 28 # xmm5 = xmm6[0],xmm1[0],zero,zero | |
vmovss dword ptr [rdi + 24], xmm2 | |
vmovlps qword ptr [rdi + 16], xmm10 | |
vshufps xmm10, xmm4, xmm3, 0 # xmm10 = xmm4[0,0],xmm3[0,0] | |
vshufps xmm5, xmm5, xmm10, 36 # xmm5 = xmm5[0,1],xmm10[2,0] | |
# Output group 1 (rdi + 32..59): element 1 of each low-half chunk.
vinsertps xmm10, xmm7, xmm0, 76 # xmm10 = xmm0[1],xmm7[1],zero,zero | |
vmovaps xmmword ptr [rdi], xmm5 | |
vmovlps qword ptr [rdi + 48], xmm10 | |
vunpcklps xmm10, xmm3, xmm4 # xmm10 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] | |
vinsertps xmm5, xmm1, xmm6, 76 # xmm5 = xmm6[1],xmm1[1],zero,zero | |
vextractps dword ptr [rdi + 56], xmm2, 1 | |
vblendps xmm5, xmm10, xmm5, 3 # xmm5 = xmm5[0,1],xmm10[2,3] | |
# Output group 2 (rdi + 64..91): element 2 of each low-half chunk.
vunpckhps xmm10, xmm0, xmm7 # xmm10 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] | |
vmovaps xmmword ptr [rdi + 32], xmm5 | |
vmovlps qword ptr [rdi + 80], xmm10 | |
vinsertps xmm10, xmm3, xmm4, 179 # xmm10 = zero,zero,xmm3[2],xmm4[2] | |
vunpckhps xmm5, xmm6, xmm1 # xmm5 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] | |
# Output group 3 (rdi + 96..123) is interleaved with group 2 from here on.
vinsertps xmm3, xmm4, xmm3, 227 # xmm3 = zero,zero,xmm3[3],xmm4[3] | |
vshufps xmm1, xmm1, xmm6, 51 # xmm1 = xmm1[3,0],xmm6[3,0] | |
vextractps dword ptr [rdi + 88], xmm2, 2 | |
vblendps xmm5, xmm10, xmm5, 3 # xmm5 = xmm5[0,1],xmm10[2,3] | |
vshufps xmm1, xmm1, xmm3, 226 # xmm1 = xmm1[2,0],xmm3[2,3] | |
vmovaps xmmword ptr [rdi + 64], xmm5 | |
vshufps xmm5, xmm7, xmm0, 51 # xmm5 = xmm7[3,0],xmm0[3,0] | |
vmovaps xmmword ptr [rdi + 96], xmm1 | |
# Output groups 4..7 are built from the upper 128-bit halves of the ymm loads
# (extracted with vextractf128 ..., 1) and from xmm12 (input bytes 208..223).
vunpcklps ymm1, ymm15, ymm11 # ymm1 = ymm15[0],ymm11[0],ymm15[1],ymm11[1],ymm15[4],ymm11[4],ymm15[5],ymm11[5] | |
vextractps dword ptr [rdi + 120], xmm2, 3 | |
vshufps xmm0, xmm5, xmm0, 226 # xmm0 = xmm5[2,0],xmm0[2,3] | |
vmovlps qword ptr [rdi + 112], xmm0 | |
# Output group 4 (rdi + 128..155).
vextractf128 xmm0, ymm1, 1 | |
vunpcklps ymm1, ymm13, ymm8 # ymm1 = ymm13[0],ymm8[0],ymm13[1],ymm8[1],ymm13[4],ymm8[4],ymm13[5],ymm8[5] | |
vmovss dword ptr [rdi + 152], xmm12 | |
# vpermpd (AVX2) moves 64-bit element 2 (from the upper lane) down to position 1.
vpermpd ymm1, ymm1, 232 # ymm1 = ymm1[0,2,2,3] | |
vmovlps qword ptr [rdi + 144], xmm0 | |
vunpcklps ymm0, ymm14, ymm9 # ymm0 = ymm14[0],ymm9[0],ymm14[1],ymm9[1],ymm14[4],ymm9[4],ymm14[5],ymm9[5] | |
vextractf128 xmm0, ymm0, 1 | |
vblendps xmm0, xmm1, xmm0, 3 # xmm0 = xmm0[0,1],xmm1[2,3] | |
# Output group 5 (rdi + 160..187).
vshufps ymm1, ymm11, ymm15, 17 # ymm1 = ymm11[1,0],ymm15[1,0],ymm11[5,4],ymm15[5,4] | |
vshufps ymm1, ymm1, ymm15, 226 # ymm1 = ymm1[2,0],ymm15[2,3],ymm1[6,4],ymm15[6,7] | |
vmovaps xmmword ptr [rdi + 128], xmm0 | |
vextractps dword ptr [rdi + 184], xmm12, 1 | |
vextractf128 xmm0, ymm1, 1 | |
vshufps ymm1, ymm8, ymm13, 17 # ymm1 = ymm8[1,0],ymm13[1,0],ymm8[5,4],ymm13[5,4] | |
vshufps ymm1, ymm1, ymm13, 226 # ymm1 = ymm1[2,0],ymm13[2,3],ymm1[6,4],ymm13[6,7] | |
vmovlps qword ptr [rdi + 176], xmm0 | |
vshufps ymm0, ymm9, ymm14, 17 # ymm0 = ymm9[1,0],ymm14[1,0],ymm9[5,4],ymm14[5,4] | |
vpermpd ymm1, ymm1, 232 # ymm1 = ymm1[0,2,2,3] | |
vshufps ymm0, ymm0, ymm14, 226 # ymm0 = ymm0[2,0],ymm14[2,3],ymm0[6,4],ymm14[6,7] | |
vextractf128 xmm0, ymm0, 1 | |
vblendps xmm0, xmm1, xmm0, 3 # xmm0 = xmm0[0,1],xmm1[2,3] | |
# Output group 6 (rdi + 192..219).
vunpckhps ymm1, ymm15, ymm11 # ymm1 = ymm15[2],ymm11[2],ymm15[3],ymm11[3],ymm15[6],ymm11[6],ymm15[7],ymm11[7] | |
vmovaps xmmword ptr [rdi + 160], xmm0 | |
vextractf128 xmm0, ymm1, 1 | |
vunpckhps ymm1, ymm13, ymm8 # ymm1 = ymm13[2],ymm8[2],ymm13[3],ymm8[3],ymm13[6],ymm8[6],ymm13[7],ymm8[7] | |
vextractps dword ptr [rdi + 216], xmm12, 2 | |
vpermpd ymm1, ymm1, 232 # ymm1 = ymm1[0,2,2,3] | |
vmovlps qword ptr [rdi + 208], xmm0 | |
vunpckhps ymm0, ymm14, ymm9 # ymm0 = ymm14[2],ymm9[2],ymm14[3],ymm9[3],ymm14[6],ymm9[6],ymm14[7],ymm9[7] | |
vextractf128 xmm0, ymm0, 1 | |
vblendps xmm0, xmm1, xmm0, 3 # xmm0 = xmm0[0,1],xmm1[2,3] | |
# Output group 7 (rdi + 224..251).
vshufps ymm1, ymm8, ymm13, 51 # ymm1 = ymm8[3,0],ymm13[3,0],ymm8[7,4],ymm13[7,4] | |
vshufps ymm1, ymm1, ymm13, 226 # ymm1 = ymm1[2,0],ymm13[2,3],ymm1[6,4],ymm13[6,7] | |
vmovaps xmmword ptr [rdi + 192], xmm0 | |
vshufps ymm0, ymm11, ymm15, 51 # ymm0 = ymm11[3,0],ymm15[3,0],ymm11[7,4],ymm15[7,4] | |
vextractps dword ptr [rdi + 248], xmm12, 3 | |
vpermpd ymm1, ymm1, 232 # ymm1 = ymm1[0,2,2,3] | |
vshufps ymm0, ymm0, ymm15, 226 # ymm0 = ymm0[2,0],ymm15[2,3],ymm0[6,4],ymm15[6,7] | |
vextractf128 xmm0, ymm0, 1 | |
vmovlps qword ptr [rdi + 240], xmm0 | |
vshufps ymm0, ymm9, ymm14, 51 # ymm0 = ymm9[3,0],ymm14[3,0],ymm9[7,4],ymm14[7,4] | |
vshufps ymm0, ymm0, ymm14, 226 # ymm0 = ymm0[2,0],ymm14[2,3],ymm0[6,4],ymm14[6,7] | |
vextractf128 xmm0, ymm0, 1 | |
vblendps xmm0, xmm1, xmm0, 3 # xmm0 = xmm0[0,1],xmm1[2,3] | |
vmovaps xmmword ptr [rdi + 224], xmm0 | |
# vzeroupper zeroes the upper 128 bits of every ymm register before returning.
vzeroupper | |
ret | |
# Follows ret; presumably alignment padding emitted by the code generator.
nop word ptr cs:[rax + rax] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Straight-line AVX/AVX2 shuffle kernel (no label is present in this listing).
# Reads 256 bytes from [rsi], rearranges 32-bit lanes, and writes eight 32-byte
# vectors (256 bytes) to [rdi]. rax holds a copy of rdi at ret.
# Presumably an 8-row "Tiny Transpose" variant -- TODO confirm which one.
# Alignment: the vmovaps xmm loads require rsi to be 16-byte aligned; the
# vmovaps ymm stores require rdi to be 32-byte aligned.
# Stack: two finished output vectors are spilled to [rsp - 56] and [rsp - 88]
# without adjusting rsp, i.e. the code writes below the stack pointer. That is
# only safe where the ABI reserves that area (looks like the System V AMD64
# red zone in a leaf routine -- confirm the target ABI).
# Low 16 bytes of the 32-byte input chunks (the chunk at +64 is loaded later).
vmovaps xmm10, xmmword ptr [rsi + 160] | |
vmovaps xmm3, xmmword ptr [rsi + 128] | |
vmovaps xmm5, xmmword ptr [rsi + 192] | |
vmovaps xmm4, xmmword ptr [rsi + 224] | |
vmovaps xmm12, xmmword ptr [rsi] | |
vmovaps xmm7, xmmword ptr [rsi + 32] | |
vmovaps xmm9, xmmword ptr [rsi + 96] | |
# Full 32-byte copies of some chunks; the rest are loaded further down as
# registers free up.
vmovups ymm13, ymmword ptr [rsi + 96] | |
vmovups ymm15, ymmword ptr [rsi + 64] | |
vmovups ymm11, ymmword ptr [rsi + 128] | |
vmovups ymm14, ymmword ptr [rsi + 32] | |
mov rax, rdi | |
# Output vector 0: element 0 of every low-half chunk, assembled in ymm1.
vinsertps xmm0, xmm3, xmm10, 28 # xmm0 = xmm3[0],xmm10[0],zero,zero | |
vshufps xmm6, xmm4, xmm5, 0 # xmm6 = xmm4[0,0],xmm5[0,0] | |
vinsertps xmm2, xmm12, xmm7, 28 # xmm2 = xmm12[0],xmm7[0],zero,zero | |
vinsertf128 ymm0, ymm0, xmm0, 1 | |
vshufps xmm6, xmm5, xmm6, 36 # xmm6 = xmm5[0,1],xmm6[2,0] | |
vinsertf128 ymm6, ymm0, xmm6, 1 | |
vblendps ymm8, ymm0, ymm6, 192 # ymm8 = ymm0[0,1,2,3,4,5],ymm6[6,7] | |
# Low 16 bytes of the chunk at +64; from here on ymm0's low half is this data,
# and the vinsertf128 ymmN, ymm0, ... uses below only keep their inserted
# upper half (the low half is replaced by the later blends with mask 240).
vmovaps xmm0, xmmword ptr [rsi + 64] | |
vshufps xmm6, xmm9, xmm0, 0 # xmm6 = xmm9[0,0],xmm0[0,0] | |
vshufps xmm2, xmm2, xmm6, 36 # xmm2 = xmm2[0,1],xmm6[2,0] | |
vinsertps xmm6, xmm10, xmm3, 76 # xmm6 = xmm3[1],xmm10[1],zero,zero | |
vblendps ymm1, ymm2, ymm8, 240 # ymm1 = ymm2[0,1,2,3],ymm8[4,5,6,7] | |
# Output vector 1: element 1 of every low-half chunk.
vunpcklps xmm2, xmm5, xmm4 # xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] | |
vinsertf128 ymm6, ymm0, xmm6, 1 | |
vmovups ymm8, ymmword ptr [rsi + 192] | |
vinsertf128 ymm2, ymm0, xmm2, 1 | |
# Spill output vector 0 below rsp (reloaded just before the final stores).
vmovups ymmword ptr [rsp - 56], ymm1 | |
vinsertps xmm1, xmm7, xmm12, 76 # xmm1 = xmm12[1],xmm7[1],zero,zero | |
vblendps ymm2, ymm6, ymm2, 192 # ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] | |
vunpcklps xmm6, xmm0, xmm9 # xmm6 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] | |
vblendps xmm1, xmm6, xmm1, 3 # xmm1 = xmm1[0,1],xmm6[2,3] | |
vunpckhps xmm6, xmm3, xmm10 # xmm6 = xmm3[2],xmm10[2],xmm3[3],xmm10[3] | |
vblendps ymm1, ymm1, ymm2, 240 # ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] | |
# Output vector 2: element 2 of every low-half chunk (stays in ymm1).
vinsertf128 ymm6, ymm0, xmm6, 1 | |
vunpckhps xmm2, xmm12, xmm7 # xmm2 = xmm12[2],xmm7[2],xmm12[3],xmm7[3] | |
# Spill output vector 1 below rsp.
vmovups ymmword ptr [rsp - 88], ymm1 | |
vinsertps xmm1, xmm5, xmm4, 179 # xmm1 = zero,zero,xmm5[2],xmm4[2] | |
vinsertf128 ymm1, ymm0, xmm1, 1 | |
vblendps ymm1, ymm6, ymm1, 192 # ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] | |
vinsertps xmm6, xmm0, xmm9, 179 # xmm6 = zero,zero,xmm0[2],xmm9[2] | |
vblendps xmm2, xmm6, xmm2, 3 # xmm2 = xmm2[0,1],xmm6[2,3] | |
vmovups ymm6, ymmword ptr [rsi] | |
vblendps ymm1, ymm2, ymm1, 240 # ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] | |
# Output vector 3: element 3 of every low-half chunk (ends up in ymm7).
vinsertps xmm2, xmm4, xmm5, 227 # xmm2 = zero,zero,xmm5[3],xmm4[3] | |
vshufps xmm5, xmm10, xmm3, 51 # xmm5 = xmm10[3,0],xmm3[3,0] | |
# Full 32-byte reloads of the last chunks; their xmm low halves are dead now.
vmovups ymm4, ymmword ptr [rsi + 224] | |
vmovups ymm10, ymmword ptr [rsi + 160] | |
vshufps xmm3, xmm5, xmm3, 226 # xmm3 = xmm5[2,0],xmm3[2,3] | |
vinsertf128 ymm2, ymm0, xmm2, 1 | |
vunpckhps ymm5, ymm15, ymm13 # ymm5 = ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[6],ymm13[6],ymm15[7],ymm13[7] | |
vinsertf128 ymm3, ymm0, xmm3, 1 | |
vinsertps xmm0, xmm9, xmm0, 227 # xmm0 = zero,zero,xmm0[3],xmm9[3] | |
# vpermpd (AVX2) moves 64-bit element 2 (from the upper lane) down to position 1.
vpermpd ymm5, ymm5, 232 # ymm5 = ymm5[0,2,2,3] | |
vblendps ymm2, ymm3, ymm2, 192 # ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] | |
vshufps xmm3, xmm7, xmm12, 51 # xmm3 = xmm7[3,0],xmm12[3,0] | |
vshufps xmm0, xmm3, xmm0, 226 # xmm0 = xmm3[2,0],xmm0[2,3] | |
vunpcklps ymm3, ymm15, ymm13 # ymm3 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[4],ymm13[4],ymm15[5],ymm13[5] | |
vpermpd ymm3, ymm3, 232 # ymm3 = ymm3[0,2,2,3] | |
vblendps ymm7, ymm0, ymm2, 240 # ymm7 = ymm0[0,1,2,3],ymm2[4,5,6,7] | |
# Output vectors 4..7 are built from the upper 128-bit halves of the full
# 32-byte loads. Output vector 4 ends up in ymm12.
vshufps ymm2, ymm4, ymm8, 0 # ymm2 = ymm4[0,0],ymm8[0,0],ymm4[4,4],ymm8[4,4] | |
vunpcklps ymm0, ymm11, ymm10 # ymm0 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[4],ymm10[4],ymm11[5],ymm10[5] | |
vshufps ymm2, ymm8, ymm2, 36 # ymm2 = ymm8[0,1],ymm2[2,0],ymm8[4,5],ymm2[6,4] | |
vblendps ymm0, ymm0, ymm2, 192 # ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] | |
vunpcklps ymm2, ymm6, ymm14 # ymm2 = ymm6[0],ymm14[0],ymm6[1],ymm14[1],ymm6[4],ymm14[4],ymm6[5],ymm14[5] | |
vextractf128 xmm2, ymm2, 1 | |
vblendps xmm2, xmm3, xmm2, 3 # xmm2 = xmm2[0,1],xmm3[2,3] | |
vshufps ymm3, ymm13, ymm15, 17 # ymm3 = ymm13[1,0],ymm15[1,0],ymm13[5,4],ymm15[5,4] | |
vshufps ymm3, ymm3, ymm15, 226 # ymm3 = ymm3[2,0],ymm15[2,3],ymm3[6,4],ymm15[6,7] | |
vblendps ymm12, ymm2, ymm0, 240 # ymm12 = ymm2[0,1,2,3],ymm0[4,5,6,7] | |
# Output vector 5 (ends up in ymm0).
vshufps ymm2, ymm10, ymm11, 17 # ymm2 = ymm10[1,0],ymm11[1,0],ymm10[5,4],ymm11[5,4] | |
vunpcklps ymm0, ymm8, ymm4 # ymm0 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[4],ymm4[4],ymm8[5],ymm4[5] | |
vpermpd ymm3, ymm3, 232 # ymm3 = ymm3[0,2,2,3] | |
vshufps ymm2, ymm2, ymm11, 226 # ymm2 = ymm2[2,0],ymm11[2,3],ymm2[6,4],ymm11[6,7] | |
vblendps ymm0, ymm2, ymm0, 192 # ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] | |
vshufps ymm2, ymm14, ymm6, 17 # ymm2 = ymm14[1,0],ymm6[1,0],ymm14[5,4],ymm6[5,4] | |
vshufps ymm2, ymm2, ymm6, 226 # ymm2 = ymm2[2,0],ymm6[2,3],ymm2[6,4],ymm6[6,7] | |
vextractf128 xmm2, ymm2, 1 | |
vblendps xmm2, xmm3, xmm2, 3 # xmm2 = xmm2[0,1],xmm3[2,3] | |
# Output vector 6 (ends up in ymm2); ymm5 still holds the vunpckhps/vpermpd
# result prepared earlier.
vshufps ymm3, ymm4, ymm8, 34 # ymm3 = ymm4[2,0],ymm8[2,0],ymm4[6,4],ymm8[6,4] | |
vblendps ymm0, ymm2, ymm0, 240 # ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] | |
vunpckhps ymm2, ymm11, ymm10 # ymm2 = ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7] | |
vshufps ymm3, ymm8, ymm3, 36 # ymm3 = ymm8[0,1],ymm3[2,0],ymm8[4,5],ymm3[6,4] | |
vblendps ymm2, ymm2, ymm3, 192 # ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] | |
vunpckhps ymm3, ymm6, ymm14 # ymm3 = ymm6[2],ymm14[2],ymm6[3],ymm14[3],ymm6[6],ymm14[6],ymm6[7],ymm14[7] | |
vextractf128 xmm3, ymm3, 1 | |
vblendps xmm3, xmm5, xmm3, 3 # xmm3 = xmm3[0,1],xmm5[2,3] | |
# Output vector 7 (ends up in ymm3).
vshufps ymm5, ymm13, ymm15, 51 # ymm5 = ymm13[3,0],ymm15[3,0],ymm13[7,4],ymm15[7,4] | |
vshufps ymm5, ymm5, ymm15, 226 # ymm5 = ymm5[2,0],ymm15[2,3],ymm5[6,4],ymm15[6,7] | |
vblendps ymm2, ymm3, ymm2, 240 # ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] | |
vunpckhps ymm3, ymm8, ymm4 # ymm3 = ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[6],ymm4[6],ymm8[7],ymm4[7] | |
vshufps ymm4, ymm10, ymm11, 51 # ymm4 = ymm10[3,0],ymm11[3,0],ymm10[7,4],ymm11[7,4] | |
vpermpd ymm5, ymm5, 232 # ymm5 = ymm5[0,2,2,3] | |
vshufps ymm4, ymm4, ymm11, 226 # ymm4 = ymm4[2,0],ymm11[2,3],ymm4[6,4],ymm11[6,7] | |
vblendps ymm3, ymm4, ymm3, 192 # ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] | |
vshufps ymm4, ymm14, ymm6, 51 # ymm4 = ymm14[3,0],ymm6[3,0],ymm14[7,4],ymm6[7,4] | |
vshufps ymm4, ymm4, ymm6, 226 # ymm4 = ymm4[2,0],ymm6[2,3],ymm4[6,4],ymm6[6,7] | |
# Reload the two spilled output vectors and write all eight results.
vmovups ymm6, ymmword ptr [rsp - 56] | |
vextractf128 xmm4, ymm4, 1 | |
vmovaps ymmword ptr [rdi], ymm6 | |
vblendps xmm4, xmm5, xmm4, 3 # xmm4 = xmm4[0,1],xmm5[2,3] | |
vmovups ymm5, ymmword ptr [rsp - 88] | |
vblendps ymm3, ymm4, ymm3, 240 # ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] | |
vmovaps ymmword ptr [rdi + 32], ymm5 | |
vmovaps ymmword ptr [rdi + 64], ymm1 | |
vmovaps ymmword ptr [rdi + 96], ymm7 | |
vmovaps ymmword ptr [rdi + 128], ymm12 | |
vmovaps ymmword ptr [rdi + 160], ymm0 | |
vmovaps ymmword ptr [rdi + 192], ymm2 | |
vmovaps ymmword ptr [rdi + 224], ymm3 | |
# vzeroupper zeroes the upper 128 bits of every ymm register before returning.
vzeroupper | |
ret | |
# Follows ret; presumably alignment padding emitted by the code generator.
nop word ptr [rax + rax] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment