x86-64 (AVX-512) assembly generated for matmul_kernel (03-matrix-multiplication-cpu.py) from TTMIR
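
For context, the kernel this listing was compiled from is the matmul_kernel of the triton-cpu matrix-multiplication tutorial; the sketch below is a minimal reconstruction so the .loc line numbers in the assembly (166, 191-196, 205-209, 217, 227) can be matched to source. The signature follows the public tutorial, and the tile sizes are only an inference from the generated code (the shll $4 / sarl $4 and shll $3 in the prologue suggest BLOCK_SIZE_M = BLOCK_SIZE_N = 16 and GROUP_SIZE_M = 8 for this build), not something stated in the gist.

import triton
import triton.language as tl

@triton.jit
def matmul_kernel(
        a_ptr, b_ptr, c_ptr,
        M, N, K,
        stride_am, stride_ak, stride_bk, stride_bn, stride_cm, stride_cn,
        BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr,
        BLOCK_SIZE_K: tl.constexpr, GROUP_SIZE_M: tl.constexpr):
    # Grouped-ordering program-id mapping (tutorial lines ~191-196).
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m
    # Tile offsets and input tile pointers (lines ~205-209).
    offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    a_ptrs = a_ptr + offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak
    b_ptrs = b_ptr + offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn
    # K loop (line ~217) accumulating a BLOCK_SIZE_M x BLOCK_SIZE_N tile.
    acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0)
        b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0)
        acc += tl.dot(a, b)
        a_ptrs += BLOCK_SIZE_K * stride_ak
        b_ptrs += BLOCK_SIZE_K * stride_bk
    # Masked store of the output tile (line ~227 onward).
    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
    tl.store(c_ptrs, acc, mask=c_mask)

The assembly that follows is the unmodified compiler output for one such program instance.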
.text
.file "LLVMDialectModule"
.section .rodata,"a",@progbits
.p2align 6, 0x0 # -- Begin function matmul_kernel
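# NOTE: the .LCPI0_* tables below are compiler-generated constant pools. Most of
# them appear to be 32-bit/64-bit lane-index vectors for AVX-512 two-source
# permutes (vpermt2d/vpermt2q-style), where a dword index >= 16 (or a qword
# index >= 8) selects a lane from the second source operand.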
.LCPI0_0:
.zero 4
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 14 # 0xe
.long 15 # 0xf
.LCPI0_15:
.long 3 # 0x3
.long 19 # 0x13
.long 2 # 0x2
.long 3 # 0x3
.long 7 # 0x7
.long 23 # 0x17
.long 6 # 0x6
.long 7 # 0x7
.long 11 # 0xb
.long 27 # 0x1b
.long 10 # 0xa
.long 11 # 0xb
.long 15 # 0xf
.long 31 # 0x1f
.long 14 # 0xe
.long 15 # 0xf
.LCPI0_16:
.quad 2 # 0x2
.quad 10 # 0xa
.quad 2 # 0x2
.quad 10 # 0xa
.quad 6 # 0x6
.quad 15 # 0xf
.quad 6 # 0x6
.quad 14 # 0xe
.LCPI0_17:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 18 # 0x12
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 22 # 0x16
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 26 # 0x1a
.long 12 # 0xc
.long 13 # 0xd
.long 14 # 0xe
.long 30 # 0x1e
.LCPI0_18:
.long 1 # 0x1
.long 17 # 0x11
.long 2 # 0x2
.long 3 # 0x3
.long 5 # 0x5
.long 21 # 0x15
.long 6 # 0x6
.long 7 # 0x7
.long 9 # 0x9
.long 25 # 0x19
.long 10 # 0xa
.long 11 # 0xb
.long 13 # 0xd
.long 29 # 0x1d
.long 14 # 0xe
.long 15 # 0xf
.LCPI0_19:
.long 0 # 0x0
.long 1 # 0x1
.long 0 # 0x0
.long 16 # 0x10
.long 4 # 0x4
.long 5 # 0x5
.long 4 # 0x4
.long 20 # 0x14
.long 8 # 0x8
.long 9 # 0x9
.long 8 # 0x8
.long 24 # 0x18
.long 12 # 0xc
.long 13 # 0xd
.long 12 # 0xc
.long 28 # 0x1c
.LCPI0_20:
.quad 0 # 0x0
.quad 8 # 0x8
.quad 0 # 0x0
.quad 8 # 0x8
.quad 4 # 0x4
.quad 12 # 0xc
.quad 4 # 0x4
.quad 13 # 0xd
.LCPI0_40:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 25 # 0x19
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_47:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 13 # 0xd
.zero 8
.zero 8
.zero 8
.LCPI0_48:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 26 # 0x1a
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_55:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 27 # 0x1b
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_56:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 27 # 0x1b
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_57:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 27 # 0x1b
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_64:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 28 # 0x1c
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_65:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 4 # 0x4
.quad 14 # 0xe
.zero 8
.zero 8
.LCPI0_66:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 28 # 0x1c
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_73:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 29 # 0x1d
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_74:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 29 # 0x1d
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_75:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 29 # 0x1d
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_76:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 29 # 0x1d
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_77:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 29 # 0x1d
.zero 4
.zero 4
.zero 4
.LCPI0_84:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 15 # 0xf
.zero 8
.zero 8
.zero 8
.LCPI0_85:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 30 # 0x1e
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_86:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 4 # 0x4
.quad 15 # 0xf
.zero 8
.zero 8
.LCPI0_87:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 30 # 0x1e
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_88:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 4 # 0x4
.quad 5 # 0x5
.quad 15 # 0xf
.zero 8
.LCPI0_89:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 30 # 0x1e
.zero 4
.zero 4
.LCPI0_96:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 31 # 0x1f
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_97:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 31 # 0x1f
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_98:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 31 # 0x1f
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_99:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 31 # 0x1f
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_100:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 31 # 0x1f
.zero 4
.zero 4
.zero 4
.LCPI0_101:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 31 # 0x1f
.zero 4
.zero 4
.LCPI0_102:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 31 # 0x1f
.zero 4
.LCPI0_109:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 16 # 0x10
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_110:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 17 # 0x11
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_111:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 17 # 0x11
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_112:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 17 # 0x11
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_113:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 9 # 0x9
.zero 8
.zero 8
.zero 8
.LCPI0_114:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 18 # 0x12
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_115:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 19 # 0x13
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_116:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 19 # 0x13
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_117:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 19 # 0x13
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_119:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 20 # 0x14
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_123:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 21 # 0x15
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_124:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 21 # 0x15
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_125:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 21 # 0x15
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_126:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 21 # 0x15
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_127:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 21 # 0x15
.zero 4
.zero 4
.zero 4
.LCPI0_128:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 21 # 0x15
.zero 4
.zero 4
.LCPI0_129:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 21 # 0x15
.zero 4
.LCPI0_130:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 14 # 0xe
.long 21 # 0x15
.LCPI0_134:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 11 # 0xb
.zero 8
.zero 8
.zero 8
.LCPI0_135:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 22 # 0x16
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_136:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 4 # 0x4
.quad 11 # 0xb
.zero 8
.zero 8
.LCPI0_137:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 22 # 0x16
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_138:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 4 # 0x4
.quad 5 # 0x5
.quad 11 # 0xb
.zero 8
.LCPI0_139:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 22 # 0x16
.zero 4
.zero 4
.LCPI0_140:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 4 # 0x4
.quad 5 # 0x5
.quad 6 # 0x6
.quad 11 # 0xb
.LCPI0_141:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 14 # 0xe
.long 22 # 0x16
.LCPI0_145:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 23 # 0x17
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_146:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 23 # 0x17
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_147:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 23 # 0x17
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_148:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 23 # 0x17
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_149:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 23 # 0x17
.zero 4
.zero 4
.zero 4
.LCPI0_150:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 23 # 0x17
.zero 4
.zero 4
.LCPI0_151:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 23 # 0x17
.zero 4
.LCPI0_152:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 14 # 0xe
.long 23 # 0x17
.LCPI0_159:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 24 # 0x18
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_160:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 4 # 0x4
.quad 12 # 0xc
.zero 8
.zero 8
.LCPI0_161:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 24 # 0x18
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_162:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 4 # 0x4
.quad 5 # 0x5
.quad 12 # 0xc
.zero 8
.LCPI0_163:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 24 # 0x18
.zero 4
.zero 4
.LCPI0_164:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 4 # 0x4
.quad 5 # 0x5
.quad 6 # 0x6
.quad 12 # 0xc
.LCPI0_165:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 14 # 0xe
.long 24 # 0x18
.LCPI0_166:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 25 # 0x19
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_167:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 25 # 0x19
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_168:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 25 # 0x19
.zero 4
.zero 4
.zero 4
.LCPI0_169:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 25 # 0x19
.zero 4
.zero 4
.LCPI0_170:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 25 # 0x19
.zero 4
.LCPI0_171:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 14 # 0xe
.long 25 # 0x19
.LCPI0_172:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 26 # 0x1a
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_173:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 4 # 0x4
.quad 5 # 0x5
.quad 13 # 0xd
.zero 8
.LCPI0_174:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 26 # 0x1a
.zero 4
.zero 4
.LCPI0_175:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 4 # 0x4
.quad 5 # 0x5
.quad 6 # 0x6
.quad 13 # 0xd
.LCPI0_176:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 14 # 0xe
.long 26 # 0x1a
.LCPI0_177:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 27 # 0x1b
.zero 4
.zero 4
.zero 4
.LCPI0_178:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 27 # 0x1b
.zero 4
.zero 4
.LCPI0_179:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 27 # 0x1b
.zero 4
.LCPI0_180:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 14 # 0xe
.long 27 # 0x1b
.LCPI0_181:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 28 # 0x1c
.zero 4
.zero 4
.LCPI0_182:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 4 # 0x4
.quad 5 # 0x5
.quad 6 # 0x6
.quad 14 # 0xe
.LCPI0_183:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 14 # 0xe
.long 28 # 0x1c
.LCPI0_184:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 29 # 0x1d
.zero 4
.LCPI0_185:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 14 # 0xe
.long 29 # 0x1d
.LCPI0_186:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 14 # 0xe
.long 30 # 0x1e
.LCPI0_193:
.zero 64
.section .rodata.cst4,"aM",@progbits,4
.p2align 2, 0x0
.LCPI0_1:
.long 14 # 0xe
.LCPI0_2:
.long 13 # 0xd
.LCPI0_3:
.long 12 # 0xc
.LCPI0_4:
.long 11 # 0xb
.LCPI0_5:
.long 10 # 0xa
.LCPI0_6:
.long 8 # 0x8
.LCPI0_7:
.long 6 # 0x6
.LCPI0_8:
.long 4 # 0x4
.LCPI0_9:
.long 2 # 0x2
.LCPI0_103:
.long 15 # 0xf
.LCPI0_104:
.long 9 # 0x9
.LCPI0_105:
.long 7 # 0x7
.LCPI0_106:
.long 5 # 0x5
.LCPI0_107:
.long 3 # 0x3
.LCPI0_108:
.long 1 # 0x1
.section .rodata.cst32,"aM",@progbits,32
.p2align 5, 0x0
.LCPI0_10:
.long 3 # 0x3
.long 11 # 0xb
.long 2 # 0x2
.long 3 # 0x3
.long 7 # 0x7
.long 15 # 0xf
.long 6 # 0x6
.long 7 # 0x7
.LCPI0_14:
.long 3 # 0x3
.long 11 # 0xb
.long 2 # 0x2
.long 3 # 0x3
.long 7 # 0x7
.long 15 # 0xf
.zero 4
.zero 4
.LCPI0_24:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 10 # 0xa
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 14 # 0xe
.LCPI0_26:
.long 0 # 0x0
.long 1 # 0x1
.long 0 # 0x0
.long 8 # 0x8
.long 4 # 0x4
.long 5 # 0x5
.long 4 # 0x4
.long 12 # 0xc
.LCPI0_27:
.long 1 # 0x1
.long 9 # 0x9
.long 2 # 0x2
.long 3 # 0x3
.long 5 # 0x5
.long 13 # 0xd
.long 6 # 0x6
.long 7 # 0x7
.LCPI0_31:
.long 1 # 0x1
.long 9 # 0x9
.long 2 # 0x2
.long 3 # 0x3
.long 5 # 0x5
.long 13 # 0xd
.zero 4
.zero 4
.LCPI0_36:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 25 # 0x19
.zero 4
.zero 4
.zero 4
.LCPI0_37:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 25 # 0x19
.zero 4
.zero 4
.LCPI0_38:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 25 # 0x19
.zero 4
.LCPI0_39:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 25 # 0x19
.LCPI0_43:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 13 # 0xd
.zero 8
.LCPI0_44:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 26 # 0x1a
.zero 4
.zero 4
.LCPI0_45:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 13 # 0xd
.LCPI0_46:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 26 # 0x1a
.LCPI0_51:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 27 # 0x1b
.zero 4
.zero 4
.zero 4
.LCPI0_52:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 27 # 0x1b
.zero 4
.zero 4
.LCPI0_53:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 27 # 0x1b
.zero 4
.LCPI0_54:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 27 # 0x1b
.LCPI0_60:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 14 # 0xe
.zero 8
.LCPI0_61:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 28 # 0x1c
.zero 4
.zero 4
.LCPI0_62:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 14 # 0xe
.LCPI0_63:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 28 # 0x1c
.LCPI0_69:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 29 # 0x1d
.zero 4
.zero 4
.zero 4
.LCPI0_70:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 29 # 0x1d
.zero 4
.zero 4
.LCPI0_71:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 29 # 0x1d
.zero 4
.LCPI0_72:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 29 # 0x1d
.LCPI0_80:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 15 # 0xf
.zero 8
.LCPI0_81:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 30 # 0x1e
.zero 4
.zero 4
.LCPI0_82:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 15 # 0xf
.LCPI0_83:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 30 # 0x1e
.LCPI0_92:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 31 # 0x1f
.zero 4
.zero 4
.zero 4
.LCPI0_93:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 31 # 0x1f
.zero 4
.zero 4
.LCPI0_94:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 31 # 0x1f
.zero 4
.LCPI0_95:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 31 # 0x1f
.LCPI0_118:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 6 # 0x6
.LCPI0_120:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 13 # 0xd
.zero 4
.zero 4
.zero 4
.LCPI0_121:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 13 # 0xd
.zero 4
.LCPI0_122:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 13 # 0xd
.LCPI0_131:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 7 # 0x7
.zero 8
.LCPI0_132:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 14 # 0xe
.zero 4
.zero 4
.LCPI0_133:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 14 # 0xe
.LCPI0_142:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 15 # 0xf
.zero 4
.zero 4
.zero 4
.LCPI0_143:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 15 # 0xf
.zero 4
.zero 4
.LCPI0_144:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 15 # 0xf
.zero 4
.LCPI0_155:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 12 # 0xc
.zero 8
.LCPI0_156:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 24 # 0x18
.zero 4
.zero 4
.LCPI0_157:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 12 # 0xc
.LCPI0_158:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 24 # 0x18
.LCPI0_187:
.long 0 # 0x0
.long 1 # 0x1
.zero 4
.zero 4
.zero 4
.zero 4
.long 4 # 0x4
.long 12 # 0xc
.LCPI0_188:
.long 1 # 0x1
.long 9 # 0x9
.zero 4
.zero 4
.long 5 # 0x5
.long 13 # 0xd
.zero 4
.zero 4
.LCPI0_189:
.zero 4
.zero 4
.zero 4
.zero 4
.long 5 # 0x5
.long 13 # 0xd
.zero 4
.zero 4
.LCPI0_190:
.long 0 # 0x0
.long 1 # 0x1
.zero 4
.zero 4
.zero 4
.zero 4
.long 6 # 0x6
.long 14 # 0xe
.LCPI0_191:
.long 3 # 0x3
.long 11 # 0xb
.zero 4
.zero 4
.long 7 # 0x7
.long 15 # 0xf
.zero 4
.zero 4
.LCPI0_192:
.zero 4
.zero 4
.zero 4
.zero 4
.long 7 # 0x7
.long 15 # 0xf
.zero 4
.zero 4
.section .rodata.cst16,"aM",@progbits,16
.p2align 4, 0x0
.LCPI0_11:
.long 7 # 0x7
.long 23 # 0x17
.long 6 # 0x6
.long 7 # 0x7
.LCPI0_12:
.long 7 # 0x7
.long 15 # 0xf
.long 6 # 0x6
.long 7 # 0x7
.LCPI0_21:
.quad 0 # 0x0
.quad 8 # 0x8
.LCPI0_22:
.zero 4
.zero 4
.long 4 # 0x4
.long 0 # 0x0
.LCPI0_23:
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 22 # 0x16
.LCPI0_25:
.long 4 # 0x4
.long 5 # 0x5
.long 4 # 0x4
.long 20 # 0x14
.LCPI0_28:
.long 5 # 0x5
.long 21 # 0x15
.long 6 # 0x6
.long 7 # 0x7
.LCPI0_29:
.long 5 # 0x5
.long 13 # 0xd
.long 6 # 0x6
.long 7 # 0x7
.LCPI0_33:
.long 3 # 0x3
.long 7 # 0x7
.zero 4
.zero 4
.LCPI0_34:
.long 8 # 0x8
.long 9 # 0x9
.long 25 # 0x19
.zero 4
.LCPI0_35:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 25 # 0x19
.LCPI0_41:
.quad 4 # 0x4
.quad 13 # 0xd
.LCPI0_42:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 26 # 0x1a
.LCPI0_49:
.long 8 # 0x8
.long 9 # 0x9
.long 27 # 0x1b
.zero 4
.LCPI0_50:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 27 # 0x1b
.LCPI0_58:
.quad 6 # 0x6
.quad 14 # 0xe
.LCPI0_59:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 28 # 0x1c
.LCPI0_67:
.long 12 # 0xc
.long 13 # 0xd
.long 29 # 0x1d
.zero 4
.LCPI0_68:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 29 # 0x1d
.LCPI0_78:
.quad 6 # 0x6
.quad 15 # 0xf
.LCPI0_79:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 30 # 0x1e
.LCPI0_90:
.long 12 # 0xc
.long 13 # 0xd
.long 31 # 0x1f
.zero 4
.LCPI0_91:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 31 # 0x1f
.LCPI0_153:
.quad 4 # 0x4
.quad 12 # 0xc
.LCPI0_154:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 24 # 0x18
.LCPI0_195:
.long 3 # 0x3
.long 7 # 0x7
.long 0 # 0x0
.long 0 # 0x0
.LCPI0_196:
.long 0 # 0x0
.long 1 # 0x1
.long 4 # 0x4
.long 12 # 0xc
.LCPI0_197:
.long 0 # 0x0
.long 1 # 0x1
.long 6 # 0x6
.long 14 # 0xe
.section .rodata.cst8,"aM",@progbits,8
.p2align 3, 0x0
.LCPI0_13:
.long 7 # 0x7
.long 15 # 0xf
.LCPI0_30:
.long 5 # 0x5
.long 13 # 0xd
.LCPI0_32:
.long 3 # 0x3
.long 19 # 0x13
.LCPI0_194:
.long 4 # 0x4
.long 0 # 0x0
.text
.globl matmul_kernel
.p2align 4, 0x90
.type matmul_kernel,@function
matmul_kernel: # @matmul_kernel
.Lfunc_begin0:
.file 1 "/data/users/minjang/triton-oss/triton-cpu/python/tutorials" "03-matrix-multiplication-cpu.py"
.loc 1 166 0 # 03-matrix-multiplication-cpu.py:166:0
.cfi_sections .debug_frame
.cfi_startproc
# %bb.0:
pushq %rbp
.cfi_def_cfa_offset 16
pushq %r15
.cfi_def_cfa_offset 24
pushq %r14
.cfi_def_cfa_offset 32
pushq %r13
.cfi_def_cfa_offset 40
pushq %r12
.cfi_def_cfa_offset 48
pushq %rbx
.cfi_def_cfa_offset 56
subq $3448, %rsp # imm = 0xD78
.cfi_def_cfa_offset 3504
.cfi_offset %rbx, -56
.cfi_offset %r12, -48
.cfi_offset %r13, -40
.cfi_offset %r14, -32
.cfi_offset %r15, -24
.cfi_offset %rbp, -16
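# Standard prologue: all six callee-saved GPRs are pushed and 3448 bytes of
# stack are reserved, used mostly for the 4/8/16/64-byte register spills below.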
# kill: def $ecx killed $ecx def $rcx
.Ltmp0:
.file 2 "/data/users/minjang/triton-oss/triton-cpu/python/triton/language" "standard.py"
.loc 2 40 22 prologue_end # standard.py:40:22
leal 15(%rcx), %eax
movl 3528(%rsp), %ebp
.loc 2 40 28 is_stmt 0 # standard.py:40:28
leal 30(%rcx), %r11d
.Ltmp1:
# kill: def $r8d killed $r8d def $r8
.loc 2 40 28 # standard.py:40:28
leal 30(%r8), %r10d
movq %rdx, 496(%rsp) # 8-byte Spill
movl $8, %ebx
.Ltmp2:
# kill: def $r9d killed $r9d def $r9
.loc 2 40 28 # standard.py:40:28
testl %eax, %eax
cmovnsl %eax, %r11d
.Ltmp3:
.loc 2 40 22 # standard.py:40:22
leal 15(%r8), %eax
.Ltmp4:
.loc 2 40 28 # standard.py:40:28
sarl $4, %r11d
.Ltmp5:
.loc 2 40 28 # standard.py:40:28
testl %eax, %eax
cmovnsl %eax, %r10d
sarl $4, %r10d
.Ltmp6:
.loc 1 192 22 is_stmt 1 # 03-matrix-multiplication-cpu.py:192:22
movl %ebp, %eax
cltd
.loc 1 191 38 # 03-matrix-multiplication-cpu.py:191:38
shll $3, %r10d
.loc 1 192 22 # 03-matrix-multiplication-cpu.py:192:22
idivl %r10d
movl %eax, %r14d
.loc 1 193 29 # 03-matrix-multiplication-cpu.py:193:29
leal (,%r14,8), %eax
.loc 1 194 35 # 03-matrix-multiplication-cpu.py:194:35
subl %eax, %r11d
.loc 1 195 33 # 03-matrix-multiplication-cpu.py:195:33
movl %ebp, %eax
cltd
.loc 1 194 48 # 03-matrix-multiplication-cpu.py:194:48
cmpl $8, %r11d
cmovll %r11d, %ebx
.loc 1 196 19 # 03-matrix-multiplication-cpu.py:196:19
imull %r14d, %r10d
.loc 1 195 33 # 03-matrix-multiplication-cpu.py:195:33
idivl %ebx
# kill: def $edx killed $edx def $rdx
.loc 1 195 27 is_stmt 0 # 03-matrix-multiplication-cpu.py:195:27
leal (%rdx,%r14,8), %r15d
.loc 1 196 19 is_stmt 1 # 03-matrix-multiplication-cpu.py:196:19
subl %r10d, %ebp
.loc 1 205 38 # 03-matrix-multiplication-cpu.py:205:38
vpbroadcastd %r15d, %zmm0
.loc 1 205 23 is_stmt 0 # 03-matrix-multiplication-cpu.py:205:23
shll $4, %r15d
.loc 1 196 40 is_stmt 1 # 03-matrix-multiplication-cpu.py:196:40
movl %ebp, %eax
cltd
.loc 1 205 38 # 03-matrix-multiplication-cpu.py:205:38
vpslld $4, %zmm0, %zmm0
vpord .LCPI0_0(%rip), %zmm0, %zmm2
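# pid_m is broadcast, shifted left by 4 (so BLOCK_SIZE_M appears to be 16 in
# this build), and OR-ed with .LCPI0_0 = [0,1,...,15]: this materializes
# offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) from source line
# 205; the "% M" part is done element-by-element by the idivl chain below.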
.loc 1 196 40 # 03-matrix-multiplication-cpu.py:196:40
idivl %ebx
.Ltmp7:
.loc 2 40 22 # standard.py:40:22
leal 15(%r9), %edx
.loc 2 40 28 is_stmt 0 # standard.py:40:28
leal 30(%r9), %ebx
movq %r8, 488(%rsp) # 8-byte Spill
movq %rcx, 480(%rsp) # 8-byte Spill
movl %r15d, -112(%rsp) # 4-byte Spill
vextracti32x4 $3, %zmm2, %xmm1
vextracti32x4 $2, %zmm2, %xmm3
vmovdqu64 %zmm2, 3248(%rsp) # 64-byte Spill
vmovdqa %xmm1, 1408(%rsp) # 16-byte Spill
vmovdqa %xmm3, 1392(%rsp) # 16-byte Spill
.Ltmp8:
.loc 1 206 23 is_stmt 1 # 03-matrix-multiplication-cpu.py:206:23
shll $4, %eax
.Ltmp9:
.loc 2 40 28 # standard.py:40:28
testl %edx, %edx
cmovnsl %edx, %ebx
movl %eax, -108(%rsp) # 4-byte Spill
.Ltmp10:
.loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22
cmpl $16, %edx
jl .LBB0_1
# %bb.2: # %.lr.ph
.loc 1 206 68 # 03-matrix-multiplication-cpu.py:206:68
cltd
.loc 1 0 0 is_stmt 0 # 03-matrix-multiplication-cpu.py:0:0
sarl $4, %ebx
vxorpd %xmm30, %xmm30, %xmm30
vpxord %xmm28, %xmm28, %xmm28
vpxord %xmm29, %xmm29, %xmm29
vpxord %xmm22, %xmm22, %xmm22
.loc 1 206 68 # 03-matrix-multiplication-cpu.py:206:68
idivl %r8d
.loc 1 205 68 is_stmt 1 # 03-matrix-multiplication-cpu.py:205:68
vpextrd $3, %xmm1, %eax
movl %edx, 96(%rsp) # 4-byte Spill
cltd
idivl %ecx
vpextrd $2, %xmm1, %eax
vpinsrd $3, %edx, %xmm0, %xmm0
cltd
idivl %ecx
vpextrd $1, %xmm1, %eax
movl %edx, 288(%rsp) # 4-byte Spill
cltd
idivl %ecx
vmovd %xmm1, %eax
vextracti128 $1, %ymm2, %xmm1
movl %edx, 32(%rsp) # 4-byte Spill
cltd
idivl %ecx
vpextrd $3, %xmm3, %eax
movl %edx, 224(%rsp) # 4-byte Spill
cltd
idivl %ecx
vpextrd $2, %xmm3, %eax
movl %edx, 416(%rsp) # 4-byte Spill
cltd
idivl %ecx
vpextrd $1, %xmm3, %eax
movl %edx, 688(%rsp) # 4-byte Spill
cltd
idivl %ecx
vmovd %xmm3, %eax
movl %edx, 624(%rsp) # 4-byte Spill
cltd
idivl %ecx
vpextrd $3, %xmm1, %eax
movl %edx, 560(%rsp) # 4-byte Spill
cltd
idivl %ecx
vpextrd $2, %xmm1, %eax
movl %edx, %ebp
cltd
idivl %ecx
vpextrd $1, %xmm1, %eax
movl %edx, %r11d
cltd
idivl %ecx
vmovd %xmm1, %eax
movl %edx, %r10d
cltd
idivl %ecx
vpextrd $3, %xmm2, %eax
movl %edx, %r14d
cltd
idivl %ecx
vpextrd $2, %xmm2, %eax
movl %edx, %r8d
cltd
idivl %ecx
vpextrd $1, %xmm2, %eax
movl %edx, %r12d
cltd
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41
vpbroadcastd %r12d, %xmm3
.loc 1 205 68 # 03-matrix-multiplication-cpu.py:205:68
idivl %ecx
movl %r15d, %eax
movl %edx, %r13d
cltd
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41
vpbroadcastd %r13d, %xmm2
movl 3504(%rsp), %r13d
.loc 1 205 68 # 03-matrix-multiplication-cpu.py:205:68
idivl %ecx
movl 3512(%rsp), %eax
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41
vpbroadcastd %edx, %xmm1
.loc 1 209 40 # 03-matrix-multiplication-cpu.py:209:40
vpbroadcastd %eax, %zmm5
vpmulld .LCPI0_1(%rip){1to16}, %zmm5, %zmm9
vpmulld .LCPI0_2(%rip){1to16}, %zmm5, %zmm11
vpmulld .LCPI0_3(%rip){1to16}, %zmm5, %zmm12
vpmulld .LCPI0_4(%rip){1to16}, %zmm5, %zmm13
.loc 1 227 33 # 03-matrix-multiplication-cpu.py:227:33
shll $4, %eax
.loc 1 209 40 # 03-matrix-multiplication-cpu.py:209:40
vpmulld .LCPI0_5(%rip){1to16}, %zmm5, %zmm14
vpmulld .LCPI0_6(%rip){1to16}, %zmm5, %zmm15
vpmulld .LCPI0_8(%rip){1to16}, %zmm5, %zmm17
vpmulld .LCPI0_7(%rip){1to16}, %zmm5, %zmm16
vpmulld .LCPI0_9(%rip){1to16}, %zmm5, %zmm10
vpslld $4, %zmm5, %zmm4
movl %eax, 160(%rsp) # 4-byte Spill
movl 96(%rsp), %eax # 4-byte Reload
vpslld $3, %zmm5, %zmm6
vpslld $2, %zmm5, %zmm8
vpsubd %zmm5, %zmm4, %zmm4
vpaddd %zmm6, %zmm5, %zmm7
vpsubd %zmm5, %zmm6, %zmm6
vpaddd %zmm8, %zmm5, %zmm8
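# The vpmulld/vpslld/vpaddd/vpsubd chain above builds small constant multiples
# of the broadcast stride in zmm5 (k * stride for k = 2..15), i.e. the row
# offsets offs_k[:, None] * stride_bk used to form the B tile pointers on
# source line 209.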
| .loc 1 209 52 is_stmt 0 # 03-matrix-multiplication-cpu.py:209:52 | |
| valignd $15, %zmm4, %zmm4, %zmm4 # zmm4 = zmm4[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] | |
| valignd $15, %zmm6, %zmm6, %zmm19 # zmm19 = zmm6[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] | |
| valignd $15, %zmm7, %zmm7, %zmm20 # zmm20 = zmm7[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] | |
| valignd $15, %zmm8, %zmm8, %zmm18 # zmm18 = zmm8[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] | |
| valignd $15, %zmm9, %zmm9, %zmm6 # zmm6 = zmm9[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] | |
| valignd $15, %zmm11, %zmm11, %zmm7 # zmm7 = zmm11[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] | |
| valignd $15, %zmm12, %zmm12, %zmm8 # zmm8 = zmm12[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] | |
| .loc 1 206 68 is_stmt 1 # 03-matrix-multiplication-cpu.py:206:68 | |
| vmovd %eax, %xmm12 | |
| .loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
| valignd $15, %zmm13, %zmm13, %zmm13 # zmm13 = zmm13[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] | |
| valignd $15, %zmm14, %zmm14, %zmm14 # zmm14 = zmm14[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] | |
| valignd $15, %zmm15, %zmm15, %zmm15 # zmm15 = zmm15[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] | |
| valignd $15, %zmm17, %zmm17, %zmm21 # zmm21 = zmm17[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] | |
| valignd $15, %zmm16, %zmm16, %zmm17 # zmm17 = zmm16[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] | |
| valignd $15, %zmm10, %zmm10, %zmm10 # zmm10 = zmm10[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] | |
| .loc 1 209 22 is_stmt 0 # 03-matrix-multiplication-cpu.py:209:22 | |
| cltq | |
| .loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
| vpaddd %xmm4, %xmm12, %xmm9 | |
| movq %rax, -96(%rsp) # 8-byte Spill | |
| .loc 1 208 41 is_stmt 1 # 03-matrix-multiplication-cpu.py:208:41 | |
| vpbroadcastd %r8d, %xmm4 | |
| .loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22 | |
| shlq $2, -96(%rsp) # 8-byte Folded Spill | |
| .loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
| vmovd %xmm9, %r15d | |
| .loc 1 209 22 is_stmt 0 # 03-matrix-multiplication-cpu.py:209:22 | |
| movslq %r15d, %rcx | |
| .loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
| vpaddd %xmm6, %xmm12, %xmm9 | |
| .loc 1 208 41 is_stmt 1 # 03-matrix-multiplication-cpu.py:208:41 | |
| vpbroadcastd %r14d, %xmm6 | |
| movq %rcx, 24(%rsp) # 8-byte Spill | |
| .loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
| vpaddd %xmm13, %xmm12, %xmm11 | |
| vpaddd %xmm14, %xmm12, %xmm13 | |
| vpaddd %xmm10, %xmm12, %xmm10 | |
| .loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22 | |
| shlq $2, 24(%rsp) # 8-byte Folded Spill | |
| .loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
| vmovd %xmm9, %r14d | |
| vpaddd %xmm7, %xmm12, %xmm9 | |
| .loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
| vpbroadcastd %r10d, %xmm7 | |
| .loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
| vmovd %xmm9, %r10d | |
| .loc 1 209 22 is_stmt 0 # 03-matrix-multiplication-cpu.py:209:22 | |
| movslq %r14d, %rcx | |
| .loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
| vpaddd %xmm8, %xmm12, %xmm9 | |
| .loc 1 208 41 is_stmt 1 # 03-matrix-multiplication-cpu.py:208:41 | |
| vpbroadcastd %r11d, %xmm8 | |
| movq %rcx, -8(%rsp) # 8-byte Spill | |
| .loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
| vmovd %xmm9, %ecx | |
| .loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
| vpbroadcastd %ebp, %xmm9 | |
| .loc 1 209 22 # 03-matrix-multiplication-cpu.py:209:22 | |
| movslq %r10d, %r10 | |
| .loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22 | |
| shlq $2, -8(%rsp) # 8-byte Folded Spill | |
| .loc 1 209 22 # 03-matrix-multiplication-cpu.py:209:22 | |
| movslq %ecx, %rcx | |
| .loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22 | |
| shlq $2, %r10 | |
| movq %rcx, -16(%rsp) # 8-byte Spill | |
| .loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
| vmovd %xmm11, %ecx | |
| movq %r10, 552(%rsp) # 8-byte Spill | |
| movl %ebx, %r10d | |
| .loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22 | |
| shlq $2, -16(%rsp) # 8-byte Folded Spill | |
| .loc 1 209 22 # 03-matrix-multiplication-cpu.py:209:22 | |
| movslq %ecx, %r11 | |
| movl 560(%rsp), %ecx # 4-byte Reload | |
| .loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22 | |
| shlq $2, %r11 | |
| movq %r11, 544(%rsp) # 8-byte Spill | |
| .loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
| vpbroadcastd %ecx, %xmm11 | |
| .loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
| vmovd %xmm13, %ecx | |
| vpaddd %xmm20, %xmm12, %xmm13 | |
| .loc 1 209 22 is_stmt 0 # 03-matrix-multiplication-cpu.py:209:22 | |
| movslq %ecx, %rcx | |
| movq %rcx, -24(%rsp) # 8-byte Spill | |
| movl 624(%rsp), %ecx # 4-byte Reload | |
| .loc 1 217 22 is_stmt 1 # 03-matrix-multiplication-cpu.py:217:22 | |
| shlq $2, -24(%rsp) # 8-byte Folded Spill | |
| .loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
| vpbroadcastd %ecx, %xmm14 | |
| .loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
| vmovd %xmm13, %ecx | |
| vpaddd %xmm15, %xmm12, %xmm13 | |
| .loc 1 209 22 is_stmt 0 # 03-matrix-multiplication-cpu.py:209:22 | |
| movslq %ecx, %rcx | |
| movq %rcx, 16(%rsp) # 8-byte Spill | |
| movl 688(%rsp), %ecx # 4-byte Reload | |
| .loc 1 217 22 is_stmt 1 # 03-matrix-multiplication-cpu.py:217:22 | |
| shlq $2, 16(%rsp) # 8-byte Folded Spill | |
| .loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
| vpbroadcastd %ecx, %xmm15 | |
| .loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
| vmovd %xmm13, %ecx | |
| vpaddd %xmm19, %xmm12, %xmm13 | |
| .loc 1 209 22 is_stmt 0 # 03-matrix-multiplication-cpu.py:209:22 | |
| movslq %ecx, %rcx | |
| movq %rcx, -40(%rsp) # 8-byte Spill | |
| movl 416(%rsp), %ecx # 4-byte Reload | |
| .loc 1 217 22 is_stmt 1 # 03-matrix-multiplication-cpu.py:217:22 | |
| shlq $2, -40(%rsp) # 8-byte Folded Spill | |
| .loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
| vpbroadcastd %ecx, %xmm16 | |
| .loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
| vmovd %xmm13, %ecx | |
| vpaddd %xmm17, %xmm12, %xmm13 | |
| .loc 1 209 22 is_stmt 0 # 03-matrix-multiplication-cpu.py:209:22 | |
| movslq %ecx, %rcx | |
| movq %rcx, 8(%rsp) # 8-byte Spill | |
| movl 224(%rsp), %ecx # 4-byte Reload | |
| .loc 1 217 22 is_stmt 1 # 03-matrix-multiplication-cpu.py:217:22 | |
| shlq $2, 8(%rsp) # 8-byte Folded Spill | |
| .loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
| vpbroadcastd %ecx, %xmm17 | |
| .loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
| vmovd %xmm13, %ecx | |
| vpaddd %xmm18, %xmm12, %xmm13 | |
| .loc 1 209 22 is_stmt 0 # 03-matrix-multiplication-cpu.py:209:22 | |
| movslq %ecx, %rcx | |
| movq %rcx, -48(%rsp) # 8-byte Spill | |
| movl 32(%rsp), %ecx # 4-byte Reload | |
| .loc 1 217 22 is_stmt 1 # 03-matrix-multiplication-cpu.py:217:22 | |
| shlq $2, -48(%rsp) # 8-byte Folded Spill | |
| .loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
| vpbroadcastd %ecx, %xmm18 | |
| .loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
| vmovd %xmm13, %ecx | |
| vpaddd %xmm21, %xmm12, %xmm13 | |
| vpxord %xmm21, %xmm21, %xmm21 | |
| .loc 1 209 22 is_stmt 0 # 03-matrix-multiplication-cpu.py:209:22 | |
| movslq %ecx, %rcx | |
| movq %rcx, (%rsp) # 8-byte Spill | |
| movl 288(%rsp), %ecx # 4-byte Reload | |
| .loc 1 217 22 is_stmt 1 # 03-matrix-multiplication-cpu.py:217:22 | |
| shlq $2, (%rsp) # 8-byte Folded Spill | |
| .loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
| vpbroadcastd %ecx, %xmm19 | |
| .loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
| vmovd %xmm13, %ecx | |
| vpaddd %xmm5, %xmm5, %xmm13 | |
| .loc 1 209 22 is_stmt 0 # 03-matrix-multiplication-cpu.py:209:22 | |
| movslq %ecx, %rcx | |
| .loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
| vpaddd %xmm5, %xmm13, %xmm13 | |
| vpaddd %xmm5, %xmm12, %xmm5 | |
| movq %rcx, -56(%rsp) # 8-byte Spill | |
| movb $-64, %cl | |
| vpaddd %xmm13, %xmm12, %xmm20 | |
| .loc 1 208 41 is_stmt 1 # 03-matrix-multiplication-cpu.py:208:41 | |
| vpbroadcastd %r13d, %xmm13 | |
| .loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22 | |
| shlq $2, -56(%rsp) # 8-byte Folded Spill | |
| vpxor %xmm12, %xmm12, %xmm12 | |
| kmovd %ecx, %k4 | |
| movw $512, %cx # imm = 0x200 | |
| .loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
| vpmulld %xmm13, %xmm0, %xmm0 | |
| vpmulld %xmm13, %xmm3, %xmm3 | |
| vpmulld %xmm13, %xmm2, %xmm2 | |
| kmovd %ecx, %k1 | |
| movb $32, %cl | |
| kmovw %k1, -114(%rsp) # 2-byte Spill | |
| kmovd %ecx, %k1 | |
| movw $2048, %cx # imm = 0x800 | |
| kmovw %k1, -116(%rsp) # 2-byte Spill | |
| kmovd %ecx, %k1 | |
| movb $64, %cl | |
| vpextrd $3, %xmm3, %ebp | |
| vbroadcasti32x4 .LCPI0_11(%rip), %zmm3 # zmm3 = [7,23,6,7,7,23,6,7,7,23,6,7,7,23,6,7] | |
| # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] | |
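| # [editor annotation] The .LCPI0_* tables loaded here and below are index | |
| # vectors for vpermt2ps/vpermt2pd; indices >= 16 select lanes from the | |
| # second source, so these shuffles interleave two tiles lane-by-lane, | |
| # which looks like a transpose/re-layout step for the accumulator. | |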
| kmovw %k1, -118(%rsp) # 2-byte Spill | |
| kmovd %ecx, %k1 | |
| movw $8192, %cx # imm = 0x2000 | |
| kmovw %k1, -120(%rsp) # 2-byte Spill | |
| kmovd %ecx, %k1 | |
| movb $-128, %cl | |
| kmovw %k1, -122(%rsp) # 2-byte Spill | |
| kmovd %ecx, %k1 | |
| .loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
| vmovd %xmm20, %ecx | |
| .loc 1 209 22 is_stmt 0 # 03-matrix-multiplication-cpu.py:209:22 | |
| movslq %ecx, %r13 | |
| .loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
| vmovd %xmm10, %ecx | |
| .loc 1 208 41 is_stmt 1 # 03-matrix-multiplication-cpu.py:208:41 | |
| vpmulld %xmm13, %xmm19, %xmm10 | |
| vpxord %xmm19, %xmm19, %xmm19 | |
| kmovw %k1, -124(%rsp) # 2-byte Spill | |
| .loc 1 209 22 # 03-matrix-multiplication-cpu.py:209:22 | |
| movslq %ecx, %rcx | |
| .loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22 | |
| shlq $2, %r13 | |
| movq %rcx, -64(%rsp) # 8-byte Spill | |
| .loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
| vmovd %xmm5, %ecx | |
| .loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
| vpmulld %xmm13, %xmm18, %xmm5 | |
| movq %r13, 536(%rsp) # 8-byte Spill | |
| .loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22 | |
| shlq $2, -64(%rsp) # 8-byte Folded Spill | |
| vpxord %xmm18, %xmm18, %xmm18 | |
| .loc 1 209 22 # 03-matrix-multiplication-cpu.py:209:22 | |
| movslq %ecx, %rdx | |
| .loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
| vpextrd $3, %xmm0, %ecx | |
| vbroadcasti32x4 .LCPI0_28(%rip), %zmm0 # zmm0 = [5,21,6,7,5,21,6,7,5,21,6,7,5,21,6,7] | |
| # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] | |
| .loc 1 208 22 is_stmt 0 # 03-matrix-multiplication-cpu.py:208:22 | |
| movslq %ecx, %rax | |
| .loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
| vpextrd $3, %xmm10, %ecx | |
| vpmulld %xmm13, %xmm17, %xmm10 | |
| vmovdqu64 %zmm3, 2352(%rsp) # 64-byte Spill | |
| vbroadcasti32x4 .LCPI0_23(%rip), %zmm3 # zmm3 = [4,5,6,22,4,5,6,22,4,5,6,22,4,5,6,22] | |
| # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] | |
| .loc 1 217 22 is_stmt 1 # 03-matrix-multiplication-cpu.py:217:22 | |
| shlq $2, %rdx | |
| movq %rax, -72(%rsp) # 8-byte Spill | |
| .loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22 | |
| movslq %ecx, %rax | |
| .loc 1 208 41 is_stmt 0 # 03-matrix-multiplication-cpu.py:208:41 | |
| vpextrd $3, %xmm5, %ecx | |
| vpmulld %xmm13, %xmm16, %xmm5 | |
| movq %rdx, 528(%rsp) # 8-byte Spill | |
| .loc 1 217 22 is_stmt 1 # 03-matrix-multiplication-cpu.py:217:22 | |
| shlq $2, -72(%rsp) # 8-byte Folded Spill | |
| vpxord %xmm16, %xmm16, %xmm16 | |
| movq %rax, -88(%rsp) # 8-byte Spill | |
| .loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22 | |
| movslq %ecx, %rax | |
| .loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22 | |
| shlq $2, -88(%rsp) # 8-byte Folded Spill | |
| .loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
| vpextrd $3, %xmm10, %ecx | |
| vpmulld %xmm13, %xmm15, %xmm10 | |
| movq %rax, -80(%rsp) # 8-byte Spill | |
| .loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22 | |
| shlq $2, -80(%rsp) # 8-byte Folded Spill | |
| vpxor %xmm15, %xmm15, %xmm15 | |
| .loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22 | |
| movslq %ecx, %r15 | |
| .loc 1 208 41 is_stmt 0 # 03-matrix-multiplication-cpu.py:208:41 | |
| vpextrd $3, %xmm5, %ecx | |
| vpmulld %xmm13, %xmm14, %xmm5 | |
| .loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22 | |
| movslq %ecx, %r12 | |
| .loc 1 217 22 is_stmt 1 # 03-matrix-multiplication-cpu.py:217:22 | |
| shlq $2, %r15 | |
| .loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
| vpextrd $3, %xmm10, %ecx | |
| vpmulld %xmm13, %xmm11, %xmm10 | |
| movq %r15, 520(%rsp) # 8-byte Spill | |
| .loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22 | |
| shlq $2, %r12 | |
| .loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22 | |
| movslq %ecx, %rax | |
| .loc 1 208 41 is_stmt 0 # 03-matrix-multiplication-cpu.py:208:41 | |
| vpextrd $3, %xmm5, %ecx | |
| vpmulld %xmm13, %xmm9, %xmm5 | |
| vmovdqu64 %zmm0, 2096(%rsp) # 64-byte Spill | |
| vbroadcasti128 .LCPI0_29(%rip), %ymm0 # ymm0 = [5,13,6,7,5,13,6,7] | |
| # ymm0 = mem[0,1,0,1] | |
| movq %r12, 512(%rsp) # 8-byte Spill | |
| vpxor %xmm9, %xmm9, %xmm9 | |
| movl $65535, %r12d # imm = 0xFFFF | |
| movq %rax, -32(%rsp) # 8-byte Spill | |
| .loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22 | |
| movslq %ecx, %rax | |
| .loc 1 217 22 is_stmt 1 # 03-matrix-multiplication-cpu.py:217:22 | |
| shlq $2, -32(%rsp) # 8-byte Folded Spill | |
| vmovdqu64 %zmm3, 2224(%rsp) # 64-byte Spill | |
| .loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
| vpextrd $3, %xmm10, %ecx | |
| movq %rax, -104(%rsp) # 8-byte Spill | |
| vpmulld %xmm13, %xmm8, %xmm10 | |
| .loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22 | |
| shlq $2, -104(%rsp) # 8-byte Folded Spill | |
| vpxor %xmm8, %xmm8, %xmm8 | |
| .loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22 | |
| movslq %ecx, %rax | |
| .loc 1 208 41 is_stmt 0 # 03-matrix-multiplication-cpu.py:208:41 | |
| vpextrd $3, %xmm5, %ecx | |
| vpmulld %xmm13, %xmm7, %xmm5 | |
| vpmulld %xmm13, %xmm6, %xmm7 | |
| vpxor %xmm6, %xmm6, %xmm6 | |
| movq %rax, 96(%rsp) # 8-byte Spill | |
| .loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22 | |
| movslq %ecx, %rax | |
| .loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
| vpextrd $3, %xmm10, %ecx | |
| movq %rax, 32(%rsp) # 8-byte Spill | |
| movq 96(%rsp), %rbx # 8-byte Reload | |
| vpextrd $3, %xmm5, %r8d | |
| vpmulld %xmm13, %xmm4, %xmm5 | |
| .loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22 | |
| movslq %ecx, %rax | |
| vbroadcasti128 .LCPI0_12(%rip), %ymm4 # ymm4 = [7,15,6,7,7,15,6,7] | |
| # ymm4 = mem[0,1,0,1] | |
| movq %rax, 224(%rsp) # 8-byte Spill | |
| movslq %r8d, %rax | |
| .loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
| vpextrd $3, %xmm7, %r8d | |
| .loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22 | |
| movslq %r8d, %r14 | |
| movq %rax, 416(%rsp) # 8-byte Spill | |
| movq 224(%rsp), %r13 # 8-byte Reload | |
| vmovdqu %ymm0, 784(%rsp) # 32-byte Spill | |
| vpxor %xmm0, %xmm0, %xmm0 | |
| .loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
| vpextrd $3, %xmm5, %r8d | |
| vpmulld %xmm13, %xmm1, %xmm5 | |
| movq 416(%rsp), %r11 # 8-byte Reload | |
| .loc 1 217 22 is_stmt 1 # 03-matrix-multiplication-cpu.py:217:22 | |
| shlq $2, %r14 | |
| vpxor %xmm1, %xmm1, %xmm1 | |
| vmovdqu64 %zmm0, 224(%rsp) # 64-byte Spill | |
| .loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22 | |
| movslq %r8d, %rax | |
| .loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22 | |
| shlq $2, %rbx | |
| movslq 160(%rsp), %r8 # 4-byte Folded Reload | |
| movq %rax, 288(%rsp) # 8-byte Spill | |
| .loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22 | |
| movslq %ebp, %rax | |
| .loc 1 208 41 is_stmt 0 # 03-matrix-multiplication-cpu.py:208:41 | |
| vpextrd $3, %xmm2, %ebp | |
| vbroadcasti32x4 .LCPI0_21(%rip), %zmm2 # zmm2 = [0,8,0,8,0,8,0,8] | |
| # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] | |
| movq %rbx, %r15 | |
| movl %r10d, %ebx | |
| movq %rax, 624(%rsp) # 8-byte Spill | |
| .loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22 | |
| movslq %ebp, %rax | |
| .loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
| vpextrd $3, %xmm5, %ebp | |
| .loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22 | |
| movslq %ebp, %rcx | |
| movq 624(%rsp), %rbp # 8-byte Reload | |
| .loc 1 217 22 is_stmt 1 # 03-matrix-multiplication-cpu.py:217:22 | |
| shlq $2, %r13 | |
| shlq $2, %rax | |
| movq %rcx, 688(%rsp) # 8-byte Spill | |
| movq 32(%rsp), %rcx # 8-byte Reload | |
| shlq $2, %r11 | |
| vmovdqu %ymm4, 752(%rsp) # 32-byte Spill | |
| vmovdqu64 %zmm0, 32(%rsp) # 64-byte Spill | |
| movq 688(%rsp), %r10 # 8-byte Reload | |
| shlq $2, %r8 | |
| movq %r8, 504(%rsp) # 8-byte Spill | |
| movq %rax, %r8 | |
| shlq $2, %rbp | |
| vmovdqu64 %zmm2, 2288(%rsp) # 64-byte Spill | |
| vbroadcasti32x4 .LCPI0_25(%rip), %zmm2 # zmm2 = [4,5,4,20,4,5,4,20,4,5,4,20,4,5,4,20] | |
| # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] | |
| shlq $2, %rcx | |
| movq %rcx, %rdx | |
| movq 288(%rsp), %rcx # 8-byte Reload | |
| shlq $2, %r10 | |
| shlq $2, %rcx | |
| vmovdqu64 %zmm2, 2160(%rsp) # 64-byte Spill | |
| .loc 1 0 22 is_stmt 0 # :0:22 | |
| .Ltmp11: | |
| .p2align 4, 0x90 | |
| .LBB0_3: # =>This Inner Loop Header: Depth=1 | |
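| # [editor annotation] Inner reduction loop over K. Each trip zero-mask | |
| # loads 16-float slices from (%rdi,...) under %k2 (py:221) and from | |
| # (%rsi,...) under %k1 (py:222), then accumulates the 16x16 tile with | |
| # vfmadd213ps (py:224); the shuffle-heavy prologue re-layouts operands. | |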
| vmovdqa64 %zmm29, %zmm26 | |
| vpunpckldq %ymm19, %ymm26, %ymm0 # ymm0 = ymm26[0],ymm19[0],ymm26[1],ymm19[1],ymm26[4],ymm19[4],ymm26[5],ymm19[5] | |
| vmovups 2352(%rsp), %zmm7 # 64-byte Reload | |
| vmovdqa64 %ymm26, %ymm2 | |
| vmovdqa64 %zmm9, %zmm27 | |
| vmovapd %ymm30, %ymm23 | |
| vmovdqa64 %zmm6, %zmm17 | |
| vmovdqa64 %zmm21, %zmm24 | |
| vmovdqa64 %zmm15, %zmm31 | |
| vpunpckhdq %ymm31, %ymm24, %ymm15 # ymm15 = ymm24[2],ymm31[2],ymm24[3],ymm31[3],ymm24[6],ymm31[6],ymm24[7],ymm31[7] | |
| vpunpckhdq %ymm18, %ymm22, %ymm20 # ymm20 = ymm22[2],ymm18[2],ymm22[3],ymm18[3],ymm22[6],ymm18[6],ymm22[7],ymm18[7] | |
| vpunpckhdq %ymm19, %ymm26, %ymm3 # ymm3 = ymm26[2],ymm19[2],ymm26[3],ymm19[3],ymm26[6],ymm19[6],ymm26[7],ymm19[7] | |
| vmovdqa64 %zmm29, %zmm4 | |
| vmovapd %zmm30, %zmm29 | |
| vpunpckldq %ymm27, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm27[0],ymm12[1],ymm27[1],ymm12[4],ymm27[4],ymm12[5],ymm27[5] | |
| vmovdqa64 %zmm12, %zmm25 | |
| vmovdqa64 %ymm26, %ymm5 | |
| vmovdqa64 %zmm12, %zmm11 | |
| vmovdqu64 %zmm1, 160(%rsp) # 64-byte Spill | |
| vmovdqa64 %zmm12, %zmm13 | |
| .loc 1 221 51 is_stmt 1 # 03-matrix-multiplication-cpu.py:221:51 | |
| testl %r9d, %r9d | |
| .loc 1 221 20 is_stmt 0 # 03-matrix-multiplication-cpu.py:221:20 | |
| movl $0, %eax | |
| vmovdqu %ymm0, 96(%rsp) # 32-byte Spill | |
| vmovaps .LCPI0_10(%rip), %ymm0 # ymm0 = [3,11,2,3,7,15,6,7] | |
| cmovgl %r12d, %eax | |
| kmovd %eax, %k2 | |
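| # [editor annotation] %k2 = 0xFFFF (%r12d) while the K counter in %r9d is | |
| # positive, else 0: the first tile's 16-float loads are either fully | |
| # enabled or fully suppressed. | |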
| movq -104(%rsp), %rax # 8-byte Reload | |
| vpermt2ps %ymm19, %ymm0, %ymm2 | |
| vpunpckhdq %zmm9, %zmm12, %zmm0 # zmm0 = zmm12[2],zmm9[2],zmm12[3],zmm9[3],zmm12[6],zmm9[6],zmm12[7],zmm9[7],zmm12[10],zmm9[10],zmm12[11],zmm9[11],zmm12[14],zmm9[14],zmm12[15],zmm9[15] | |
| vpunpckhdq %ymm27, %ymm12, %ymm9 # ymm9 = ymm12[2],ymm27[2],ymm12[3],ymm27[3],ymm12[6],ymm27[6],ymm12[7],ymm27[7] | |
| vshuff64x2 $85, %zmm15, %zmm15, %zmm15 # zmm15 = zmm15[2,3,2,3,2,3,2,3] | |
| vinserti64x4 $1, %ymm20, %zmm0, %zmm20 | |
| vmovups %zmm0, 352(%rsp) # 64-byte Spill | |
| vinsertf64x4 $1, %ymm2, %zmm0, %zmm14 | |
| vmovdqa64 %zmm12, %zmm2 | |
| vpermt2ps %zmm27, %zmm7, %zmm2 | |
| vmovups 752(%rsp), %ymm7 # 32-byte Reload | |
| vmovapd %zmm30, %zmm0 | |
| vshuff64x2 $85, %zmm9, %zmm9, %zmm9 # zmm9 = zmm9[2,3,2,3,2,3,2,3] | |
| vmovups 224(%rsp), %zmm30 # 64-byte Reload | |
| vshuff64x2 $85, %zmm10, %zmm10, %zmm10 # zmm10 = zmm10[2,3,2,3,2,3,2,3] | |
| vshufpd $32, %zmm15, %zmm2, %zmm2 # zmm2 = zmm2[0],zmm15[0],zmm2[2],zmm15[2],zmm2[4],zmm15[5],zmm2[6],zmm15[6] | |
| vmovups 2224(%rsp), %zmm15 # 64-byte Reload | |
| vshufpd $128, %zmm20, %zmm14, %zmm2 {%k4} # zmm2 {%k4} = zmm14[0],zmm20[0],zmm14[2],zmm20[2],zmm14[4],zmm20[4],zmm14[6],zmm20[7] | |
| vmovdqa %ymm1, %ymm14 | |
| vpermt2ps %ymm6, %ymm7, %ymm23 | |
| vmovaps .LCPI0_14(%rip), %ymm6 # ymm6 = [3,11,2,3,7,15,u,u] | |
| vextractf32x4 $1, %ymm23, %xmm7 | |
| vpermt2ps %ymm8, %ymm6, %ymm14 | |
| vpunpckhdq %ymm16, %ymm28, %ymm6 # ymm6 = ymm28[2],ymm16[2],ymm28[3],ymm16[3],ymm28[6],ymm16[6],ymm28[7],ymm16[7] | |
| vblendps $192, %ymm6, %ymm14, %ymm6 # ymm6 = ymm14[0,1,2,3,4,5],ymm6[6,7] | |
| vmovdqa64 %zmm21, %zmm14 | |
| vmovaps %zmm8, %zmm21 | |
| vmovaps .LCPI0_24(%rip), %ymm8 # ymm8 = [0,1,2,10,4,5,6,14] | |
| vpermt2ps %zmm31, %zmm15, %zmm14 | |
| vmovdqa64 %zmm26, %zmm15 | |
| vpunpckldq %xmm27, %xmm12, %xmm26 # xmm26 = xmm12[0],xmm27[0],xmm12[1],xmm27[1] | |
| vshufpd $32, %zmm14, %zmm9, %zmm20 # zmm20 = zmm9[0],zmm14[0],zmm9[2],zmm14[2],zmm9[4],zmm14[5],zmm9[6],zmm14[6] | |
| vmovdqa64 %ymm22, %ymm9 | |
| vinsertf64x4 $1, %ymm3, %zmm0, %zmm14 | |
| vbroadcastsd .LCPI0_13(%rip), %ymm3 # ymm3 = [7,15,7,15,7,15,7,15] | |
| vpermt2ps %ymm18, %ymm8, %ymm9 | |
| vmovaps .LCPI0_15(%rip), %zmm8 # zmm8 = [3,19,2,3,7,23,6,7,11,27,10,11,15,31,14,15] | |
| vinsertf64x4 $1, %ymm9, %zmm0, %zmm9 | |
| vshufpd $128, %zmm9, %zmm14, %zmm20 {%k4} # zmm20 {%k4} = zmm14[0],zmm9[0],zmm14[2],zmm9[2],zmm14[4],zmm9[4],zmm14[6],zmm9[7] | |
| vmovups 32(%rsp), %zmm14 # 64-byte Reload | |
| vmovupd %zmm20, 288(%rsp) # 64-byte Spill | |
| vpermt2ps %zmm27, %zmm8, %zmm25 | |
| vmovaps %ymm14, %ymm9 | |
| vpermt2ps %ymm30, %ymm3, %ymm9 | |
| vmovups 2160(%rsp), %zmm3 # 64-byte Reload | |
| vunpckhps %zmm30, %zmm14, %zmm23 # zmm23 = zmm14[2],zmm30[2],zmm14[3],zmm30[3],zmm14[6],zmm30[6],zmm14[7],zmm30[7],zmm14[10],zmm30[10],zmm14[11],zmm30[11],zmm14[14],zmm30[14],zmm14[15],zmm30[15] | |
| vmovups %zmm23, 1584(%rsp) # 64-byte Spill | |
| vblendps $3, %xmm7, %xmm9, %xmm7 # xmm7 = xmm7[0,1],xmm9[2,3] | |
| vmovdqa64 %zmm24, %zmm9 | |
| vblendps $15, %ymm7, %ymm6, %ymm6 # ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] | |
| vpunpckldq %ymm31, %ymm24, %ymm7 # ymm7 = ymm24[0],ymm31[0],ymm24[1],ymm31[1],ymm24[4],ymm31[4],ymm24[5],ymm31[5] | |
| vinsertf64x4 $0, %ymm6, %zmm2, %zmm2 | |
| vpermt2ps %zmm31, %zmm3, %zmm9 | |
| vmovaps .LCPI0_26(%rip), %ymm3 # ymm3 = [0,1,0,8,4,5,4,12] | |
| vmovupd %zmm2, 1072(%rsp) # 64-byte Spill | |
| vmovdqa64 %zmm24, %zmm2 | |
| vshufpd $32, %zmm9, %zmm10, %zmm20 # zmm20 = zmm10[0],zmm9[0],zmm10[2],zmm9[2],zmm10[4],zmm9[5],zmm10[6],zmm9[6] | |
| vinsertf64x4 $1, 96(%rsp), %zmm0, %zmm10 # 32-byte Folded Reload | |
| vmovdqa64 %ymm22, %ymm9 | |
| vpermt2ps %ymm18, %ymm3, %ymm9 | |
| vinsertf64x4 $1, %ymm9, %zmm0, %zmm9 | |
| vpermt2ps %zmm17, %zmm8, %zmm0 | |
| vshufpd $128, %zmm9, %zmm10, %zmm20 {%k4} # zmm20 {%k4} = zmm10[0],zmm9[0],zmm10[2],zmm9[2],zmm10[4],zmm9[4],zmm10[6],zmm9[7] | |
| vpunpckhdq %zmm31, %zmm24, %zmm9 # zmm9 = zmm24[2],zmm31[2],zmm24[3],zmm31[3],zmm24[6],zmm31[6],zmm24[7],zmm31[7],zmm24[10],zmm31[10],zmm24[11],zmm31[11],zmm24[14],zmm31[14],zmm24[15],zmm31[15] | |
| vshufpd $32, %zmm9, %zmm25, %zmm3 # zmm3 = zmm25[0],zmm9[0],zmm25[2],zmm9[2],zmm25[4],zmm9[5],zmm25[6],zmm9[6] | |
| vmovupd %zmm20, 560(%rsp) # 64-byte Spill | |
| vmovdqa64 %zmm15, %zmm20 | |
| vmovupd %zmm3, 96(%rsp) # 64-byte Spill | |
| vmovapd .LCPI0_16(%rip), %zmm3 # zmm3 = [2,10,2,10,6,15,6,14] | |
| vshuff64x2 $85, %zmm7, %zmm7, %zmm7 # zmm7 = zmm7[2,3,2,3,2,3,2,3] | |
| vmovups %zmm0, 1776(%rsp) # 64-byte Spill | |
| vpermt2pd %zmm9, %zmm3, %zmm25 | |
| vmovapd %zmm3, %zmm10 | |
| vmovaps .LCPI0_27(%rip), %ymm3 # ymm3 = [1,9,2,3,5,13,6,7] | |
| vextractf32x4 $3, %zmm0, %xmm9 | |
| vmovupd %zmm25, 416(%rsp) # 64-byte Spill | |
| vmovaps %zmm8, %zmm25 | |
| vpermt2ps %zmm19, %zmm25, %zmm20 | |
| vunpcklps %zmm17, %zmm29, %zmm25 # zmm25 = zmm29[0],zmm17[0],zmm29[1],zmm17[1],zmm29[4],zmm17[4],zmm29[5],zmm17[5],zmm29[8],zmm17[8],zmm29[9],zmm17[9],zmm29[12],zmm17[12],zmm29[13],zmm17[13] | |
| vpermt2ps %ymm19, %ymm3, %ymm5 | |
| vmovups 2096(%rsp), %zmm3 # 64-byte Reload | |
| vinsertf64x4 $1, %ymm5, %zmm0, %zmm5 | |
| vpermt2ps %zmm27, %zmm3, %zmm11 | |
| vshufpd $32, %zmm7, %zmm11, %zmm3 # zmm3 = zmm11[0],zmm7[0],zmm11[2],zmm7[2],zmm11[4],zmm7[5],zmm11[6],zmm7[6] | |
| vpunpckldq %ymm18, %ymm22, %ymm7 # ymm7 = ymm22[0],ymm18[0],ymm22[1],ymm18[1],ymm22[4],ymm18[4],ymm22[5],ymm18[5] | |
| vpunpckldq %xmm19, %xmm15, %xmm11 # xmm11 = xmm15[0],xmm19[0],xmm15[1],xmm19[1] | |
| vinsertf64x4 $1, %ymm7, %zmm0, %zmm7 | |
| vmovups 160(%rsp), %zmm0 # 64-byte Reload | |
| vshufpd $128, %zmm7, %zmm5, %zmm3 {%k4} # zmm3 {%k4} = zmm5[0],zmm7[0],zmm5[2],zmm7[2],zmm5[4],zmm7[4],zmm5[6],zmm7[7] | |
| vmovdqa64 %zmm24, %zmm7 | |
| vextractf32x4 $3, %zmm23, %xmm5 | |
| vmovdqa64 %zmm12, %zmm23 | |
| vmovupd %zmm3, 1328(%rsp) # 64-byte Spill | |
| vbroadcastsd .LCPI0_32(%rip), %zmm3 # zmm3 = [3,19,3,19,3,19,3,19,3,19,3,19,3,19,3,19] | |
| vpermt2ps %zmm21, %zmm8, %zmm0 | |
| vblendps $3, %xmm9, %xmm5, %xmm8 # xmm8 = xmm9[0,1],xmm5[2,3] | |
| vpermt2ps %zmm27, %zmm3, %zmm13 | |
| vpermt2ps %zmm31, %zmm3, %zmm7 | |
| vpermt2ps %zmm19, %zmm3, %zmm4 | |
| vmovups %zmm0, 160(%rsp) # 64-byte Spill | |
| vextractf64x4 $1, %zmm0, %ymm9 | |
| vpunpckhdq %zmm16, %zmm28, %zmm0 # zmm0 = zmm28[2],zmm16[2],zmm28[3],zmm16[3],zmm28[6],zmm16[6],zmm28[7],zmm16[7],zmm28[10],zmm16[10],zmm28[11],zmm16[11],zmm28[14],zmm16[14],zmm28[15],zmm16[15] | |
| vmovdqu64 %zmm0, 3184(%rsp) # 64-byte Spill | |
| vshufpd $32, %zmm7, %zmm13, %zmm5 # zmm5 = zmm13[0],zmm7[0],zmm13[2],zmm7[2],zmm13[4],zmm7[5],zmm13[6],zmm7[6] | |
| vmovdqa64 %zmm22, %zmm7 | |
| vpermt2ps %zmm18, %zmm3, %zmm7 | |
| vmovaps .LCPI0_17(%rip), %zmm3 # zmm3 = [0,1,2,18,4,5,6,22,8,9,10,26,12,13,14,30] | |
| vshufpd $128, %zmm7, %zmm4, %zmm5 {%k4} # zmm5 {%k4} = zmm4[0],zmm7[0],zmm4[2],zmm7[2],zmm4[4],zmm7[4],zmm4[6],zmm7[7] | |
| vextracti64x4 $1, %zmm0, %ymm4 | |
| vmovupd 352(%rsp), %zmm0 # 64-byte Reload | |
| vmovapd %zmm10, %zmm7 | |
| vmovupd %zmm5, 688(%rsp) # 64-byte Spill | |
| vblendps $192, %ymm4, %ymm9, %ymm4 # ymm4 = ymm9[0,1,2,3,4,5],ymm4[6,7] | |
| vpunpckldq %zmm27, %zmm12, %zmm9 # zmm9 = zmm12[0],zmm27[0],zmm12[1],zmm27[1],zmm12[4],zmm27[4],zmm12[5],zmm27[5],zmm12[8],zmm27[8],zmm12[9],zmm27[9],zmm12[12],zmm27[12],zmm12[13],zmm27[13] | |
| vpermt2ps %zmm31, %zmm3, %zmm2 | |
| vmovaps %zmm3, %zmm6 | |
| vshufpd $32, %zmm2, %zmm0, %zmm5 # zmm5 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[5],zmm0[6],zmm2[6] | |
| vpermt2pd %zmm2, %zmm10, %zmm0 | |
| vunpckhps %zmm17, %zmm29, %zmm2 # zmm2 = zmm29[2],zmm17[2],zmm29[3],zmm17[3],zmm29[6],zmm17[6],zmm29[7],zmm17[7],zmm29[10],zmm17[10],zmm29[11],zmm17[11],zmm29[14],zmm17[14],zmm29[15],zmm17[15] | |
| vmovupd %zmm5, 624(%rsp) # 64-byte Spill | |
| vmovaps %zmm14, %zmm5 | |
| vpermt2ps %zmm30, %zmm3, %zmm5 | |
| vmovupd %zmm2, 3056(%rsp) # 64-byte Spill | |
| vextractf32x4 $3, %zmm2, %xmm2 | |
| vmovaps .LCPI0_18(%rip), %zmm3 # zmm3 = [1,17,2,3,5,21,6,7,9,25,10,11,13,29,14,15] | |
| vmovupd %zmm0, 1840(%rsp) # 64-byte Spill | |
| vextractf32x4 $3, %zmm5, %xmm0 | |
| vblendps $3, %xmm2, %xmm0, %xmm0 # xmm0 = xmm2[0,1],xmm0[2,3] | |
| vmovups %ymm0, 3120(%rsp) # 32-byte Spill | |
| vpunpckhdq %xmm27, %xmm12, %xmm0 # xmm0 = xmm12[2],xmm27[2],xmm12[3],xmm27[3] | |
| vpermt2ps %zmm27, %zmm3, %zmm23 | |
| vmovdqu64 %zmm0, 1136(%rsp) # 64-byte Spill | |
| vinsertps $76, %xmm12, %xmm27, %xmm0 # xmm0 = xmm12[1],xmm27[1],zero,zero | |
| vmovdqa64 %zmm15, %zmm27 | |
| vpermt2ps %zmm19, %zmm3, %zmm27 | |
| vmovdqa64 %zmm9, %zmm12 | |
| vmovups %zmm0, 1200(%rsp) # 64-byte Spill | |
| vblendps $15, %ymm8, %ymm4, %ymm0 # ymm0 = ymm8[0,1,2,3],ymm4[4,5,6,7] | |
| vpunpckhdq %zmm21, %zmm1, %zmm4 # zmm4 = zmm1[2],zmm21[2],zmm1[3],zmm21[3],zmm1[6],zmm21[6],zmm1[7],zmm21[7],zmm1[10],zmm21[10],zmm1[11],zmm21[11],zmm1[14],zmm21[14],zmm1[15],zmm21[15] | |
| vmovaps %zmm6, %zmm8 | |
| vmovups %ymm0, 880(%rsp) # 32-byte Spill | |
| vmovdqa64 %zmm28, %zmm0 | |
| vpermt2ps %zmm16, %zmm6, %zmm0 | |
| vmovdqu64 %zmm4, 2736(%rsp) # 64-byte Spill | |
| vextracti64x4 $1, %zmm4, %ymm4 | |
| vextractf64x4 $1, %zmm0, %ymm2 | |
| vpblendd $192, %ymm2, %ymm4, %ymm2 # ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] | |
| vmovdqu %ymm2, 2800(%rsp) # 32-byte Spill | |
| vpunpckldq %zmm31, %zmm24, %zmm2 # zmm2 = zmm24[0],zmm31[0],zmm24[1],zmm31[1],zmm24[4],zmm31[4],zmm24[5],zmm31[5],zmm24[8],zmm31[8],zmm24[9],zmm31[9],zmm24[12],zmm31[12],zmm24[13],zmm31[13] | |
| vshufpd $32, %zmm2, %zmm23, %zmm4 # zmm4 = zmm23[0],zmm2[0],zmm23[2],zmm2[2],zmm23[4],zmm2[5],zmm23[6],zmm2[6] | |
| vpermt2pd %zmm2, %zmm10, %zmm23 | |
| vpunpckhdq %zmm19, %zmm15, %zmm2 # zmm2 = zmm15[2],zmm19[2],zmm15[3],zmm19[3],zmm15[6],zmm19[6],zmm15[7],zmm19[7],zmm15[10],zmm19[10],zmm15[11],zmm19[11],zmm15[14],zmm19[14],zmm15[15],zmm19[15] | |
| vunpcklps %zmm30, %zmm14, %zmm10 # zmm10 = zmm14[0],zmm30[0],zmm14[1],zmm30[1],zmm14[4],zmm30[4],zmm14[5],zmm30[5],zmm14[8],zmm30[8],zmm14[9],zmm30[9],zmm14[12],zmm30[12],zmm14[13],zmm30[13] | |
| vmovdqu64 %zmm2, 2928(%rsp) # 64-byte Spill | |
| vpunpckldq %zmm19, %zmm15, %zmm2 # zmm2 = zmm15[0],zmm19[0],zmm15[1],zmm19[1],zmm15[4],zmm19[4],zmm15[5],zmm19[5],zmm15[8],zmm19[8],zmm15[9],zmm19[9],zmm15[12],zmm19[12],zmm15[13],zmm19[13] | |
| vmovupd %zmm4, 1264(%rsp) # 64-byte Spill | |
| vmovdqu64 %zmm2, 816(%rsp) # 64-byte Spill | |
| vpunpckhdq %xmm19, %xmm15, %xmm2 # xmm2 = xmm15[2],xmm19[2],xmm15[3],xmm19[3] | |
| vmovdqu64 %zmm2, 1968(%rsp) # 64-byte Spill | |
| vinsertps $76, %xmm15, %xmm19, %xmm2 # xmm2 = xmm15[1],xmm19[1],zero,zero | |
| vmovapd %zmm29, %zmm19 | |
| vpermt2ps %zmm17, %zmm3, %zmm19 | |
| vmovups %zmm2, 1008(%rsp) # 64-byte Spill | |
| vextractf32x4 $3, %zmm10, %xmm2 | |
| vextractf32x4 $2, %zmm10, %xmm10 | |
| vextractf32x4 $3, %zmm19, %xmm4 | |
| vblendps $3, %xmm4, %xmm2, %xmm4 # xmm4 = xmm4[0,1],xmm2[2,3] | |
| vmovdqa64 %zmm1, %zmm2 | |
| vpermt2ps %zmm21, %zmm3, %zmm2 | |
| vpunpckldq %zmm16, %zmm28, %zmm3 # zmm3 = zmm28[0],zmm16[0],zmm28[1],zmm16[1],zmm28[4],zmm16[4],zmm28[5],zmm16[5],zmm28[8],zmm16[8],zmm28[9],zmm16[9],zmm28[12],zmm16[12],zmm28[13],zmm16[13] | |
| vmovdqu64 %zmm3, 2544(%rsp) # 64-byte Spill | |
| vextracti64x4 $1, %zmm3, %ymm3 | |
| vextractf64x4 $1, %zmm2, %ymm15 | |
| vblendps $192, %ymm3, %ymm15, %ymm3 # ymm3 = ymm15[0,1,2,3,4,5],ymm3[6,7] | |
| vextractf32x4 $3, %zmm25, %xmm15 | |
| vblendps $15, %ymm4, %ymm3, %ymm3 # ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] | |
| vpunpckldq %xmm31, %xmm24, %xmm4 # xmm4 = xmm24[0],xmm31[0],xmm24[1],xmm31[1] | |
| vmovdqu64 %zmm4, 2608(%rsp) # 64-byte Spill | |
| vpunpckhdq %xmm31, %xmm24, %xmm4 # xmm4 = xmm24[2],xmm31[2],xmm24[3],xmm31[3] | |
| vmovups %ymm3, 1712(%rsp) # 32-byte Spill | |
| vmovdqa64 %zmm24, %zmm3 | |
| vmovdqu64 %zmm4, 1648(%rsp) # 64-byte Spill | |
| vinsertps $76, %xmm24, %xmm31, %xmm4 # xmm4 = xmm24[1],xmm31[1],zero,zero | |
| vmovaps %zmm14, %zmm24 | |
| vmovups %zmm4, 944(%rsp) # 64-byte Spill | |
| vmovaps .LCPI0_19(%rip), %zmm4 # zmm4 = [0,1,0,16,4,5,4,20,8,9,8,24,12,13,12,28] | |
| vpermt2ps %zmm31, %zmm4, %zmm3 | |
| vpermt2ps %zmm30, %zmm4, %zmm24 | |
| vmovaps %zmm14, %zmm31 | |
| vpunpcklqdq %ymm28, %ymm16, %ymm14 # ymm14 = ymm16[0],ymm28[0],ymm16[2],ymm28[2] | |
| vmovdqu %ymm14, 2672(%rsp) # 32-byte Spill | |
| vpunpckldq %ymm16, %ymm28, %ymm14 # ymm14 = ymm28[0],ymm16[0],ymm28[1],ymm16[1],ymm28[4],ymm16[4],ymm28[5],ymm16[5] | |
| vmovdqu %ymm14, 2864(%rsp) # 32-byte Spill | |
| vinsertps $179, %xmm16, %xmm28, %xmm14 # xmm14 = zero,zero,xmm28[2],xmm16[2] | |
| vmovaps %xmm14, 2992(%rsp) # 16-byte Spill | |
| vunpckhps %xmm16, %xmm28, %xmm14 # xmm14 = xmm28[2],xmm16[2],xmm28[3],xmm16[3] | |
| vmovaps %xmm14, 1520(%rsp) # 16-byte Spill | |
| vunpcklps %xmm16, %xmm28, %xmm14 # xmm14 = xmm28[0],xmm16[0],xmm28[1],xmm16[1] | |
| vshufpd $32, %zmm3, %zmm9, %zmm6 # zmm6 = zmm9[0],zmm3[0],zmm9[2],zmm3[2],zmm9[4],zmm3[5],zmm9[6],zmm3[6] | |
| vpermt2pd %zmm3, %zmm7, %zmm12 | |
| vmovdqa64 %zmm22, %zmm7 | |
| vpunpckldq %zmm18, %zmm22, %zmm9 # zmm9 = zmm22[0],zmm18[0],zmm22[1],zmm18[1],zmm22[4],zmm18[4],zmm22[5],zmm18[5],zmm22[8],zmm18[8],zmm22[9],zmm18[9],zmm22[12],zmm18[12],zmm22[13],zmm18[13] | |
| vpunpckhdq %zmm18, %zmm22, %zmm3 # zmm3 = zmm22[2],zmm18[2],zmm22[3],zmm18[3],zmm22[6],zmm18[6],zmm22[7],zmm18[7],zmm22[10],zmm18[10],zmm22[11],zmm18[11],zmm22[14],zmm18[14],zmm22[15],zmm18[15] | |
| vpermt2ps %zmm18, %zmm4, %zmm7 | |
| vextractf32x4 $3, %zmm24, %xmm13 | |
| vmovaps %xmm14, 1456(%rsp) # 16-byte Spill | |
| vmovupd %zmm6, 352(%rsp) # 64-byte Spill | |
| vpunpckldq %xmm18, %xmm22, %xmm6 # xmm6 = xmm22[0],xmm18[0],xmm22[1],xmm18[1] | |
| vshufpd $128, %zmm9, %zmm27, %zmm23 {%k4} # zmm23 {%k4} = zmm27[0],zmm9[0],zmm27[2],zmm9[2],zmm27[4],zmm9[4],zmm27[6],zmm9[7] | |
| vmovdqu64 %zmm6, 2480(%rsp) # 64-byte Spill | |
| vpunpckhdq %xmm18, %xmm22, %xmm6 # xmm6 = xmm22[2],xmm18[2],xmm22[3],xmm18[3] | |
| vblendps $3, %xmm15, %xmm13, %xmm13 # xmm13 = xmm15[0,1],xmm13[2,3] | |
| vmovdqu64 %zmm6, 2032(%rsp) # 64-byte Spill | |
| vinsertps $76, %xmm22, %xmm18, %xmm6 # xmm6 = xmm22[1],xmm18[1],zero,zero | |
| vpermt2ps %zmm18, %zmm8, %zmm22 | |
| vmovddup .LCPI0_194(%rip), %xmm8 # xmm8 = [4,0,4,0] | |
| # xmm8 = mem[0,0] | |
| vunpckhpd %ymm28, %ymm16, %ymm18 # ymm18 = ymm16[1],ymm28[1],ymm16[3],ymm28[3] | |
| vmovups %zmm6, 1904(%rsp) # 64-byte Spill | |
| vmovaps %zmm4, %zmm6 | |
| vmovaps %xmm16, %xmm4 | |
| vpermt2ps %xmm28, %xmm8, %xmm4 | |
| vmovaps %zmm28, %zmm8 | |
| vpermt2ps %zmm16, %zmm6, %zmm8 | |
| vpunpckldq %zmm21, %zmm1, %zmm28 # zmm28 = zmm1[0],zmm21[0],zmm1[1],zmm21[1],zmm1[4],zmm21[4],zmm1[5],zmm21[5],zmm1[8],zmm21[8],zmm1[9],zmm21[9],zmm1[12],zmm21[12],zmm1[13],zmm21[13] | |
| vmovupd 2288(%rsp), %zmm16 # 64-byte Reload | |
| vextracti64x4 $1, %zmm28, %ymm14 | |
| vextractf64x4 $1, %zmm8, %ymm15 | |
| vpblendd $192, %ymm15, %ymm14, %ymm14 # ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] | |
| vpblendd $15, %ymm13, %ymm14, %ymm6 # ymm6 = ymm13[0,1,2,3],ymm14[4,5,6,7] | |
| vpermt2pd 2608(%rsp), %zmm16, %zmm26 # 64-byte Folded Reload | |
| vpermt2pd 2480(%rsp), %zmm16, %zmm11 # 64-byte Folded Reload | |
| vmovdqu %ymm6, 1424(%rsp) # 32-byte Spill | |
| vmovupd 1584(%rsp), %zmm6 # 64-byte Reload | |
| vextractf32x4 $2, %zmm6, %xmm13 | |
| vmovupd 1776(%rsp), %zmm6 # 64-byte Reload | |
| vmovapd %zmm11, %zmm26 {%k4} | |
| vunpckhps %xmm30, %xmm31, %xmm11 # xmm11 = xmm31[2],xmm30[2],xmm31[3],xmm30[3] | |
| vextractf32x4 $2, %zmm6, %xmm14 | |
| vmovupd 160(%rsp), %zmm6 # 64-byte Reload | |
| vblendpd $1, %xmm14, %xmm13, %xmm13 # xmm13 = xmm14[0],xmm13[1] | |
| vshuff64x2 $170, %zmm6, %zmm6, %zmm14 # zmm14 = zmm6[4,5,4,5,4,5,4,5] | |
| vmovupd 3184(%rsp), %zmm6 # 64-byte Reload | |
| vshuff64x2 $170, %zmm6, %zmm6, %zmm15 # zmm15 = zmm6[4,5,4,5,4,5,4,5] | |
| vshuff64x2 $170, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[4,5,4,5,4,5,4,5] | |
| vextractf32x4 $2, %zmm24, %xmm6 | |
| vmovupd 816(%rsp), %zmm24 # 64-byte Reload | |
| vblendpd $8, %ymm15, %ymm14, %ymm14 # ymm14 = ymm14[0,1,2],ymm15[3] | |
| vblendpd $3, %ymm13, %ymm14, %ymm15 # ymm15 = ymm13[0,1],ymm14[2,3] | |
| vextractf32x4 $2, %zmm5, %xmm13 | |
| vmovups 3056(%rsp), %zmm5 # 64-byte Reload | |
| vshufpd $128, %zmm7, %zmm24, %zmm12 {%k4} # zmm12 {%k4} = zmm24[0],zmm7[0],zmm24[2],zmm7[2],zmm24[4],zmm7[4],zmm24[6],zmm7[7] | |
| vextractf32x4 $2, %zmm5, %xmm14 | |
| vmovupd 2736(%rsp), %zmm5 # 64-byte Reload | |
| vblendps $3, %xmm14, %xmm13, %xmm13 # xmm13 = xmm14[0,1],xmm13[2,3] | |
| vshuff64x2 $170, %zmm5, %zmm5, %zmm14 # zmm14 = zmm5[4,5,4,5,4,5,4,5] | |
| vunpcklps %xmm17, %xmm29, %xmm5 # xmm5 = xmm29[0],xmm17[0],xmm29[1],xmm17[1] | |
| vblendpd $8, %ymm0, %ymm14, %ymm14 # ymm14 = ymm14[0,1,2],ymm0[3] | |
| vmovupd 2800(%rsp), %ymm0 # 32-byte Reload | |
| vblendpd $3, %ymm13, %ymm14, %ymm13 # ymm13 = ymm13[0,1],ymm14[2,3] | |
| vextractf32x4 $2, %zmm19, %xmm14 | |
| vmovupd 416(%rsp), %zmm19 # 64-byte Reload | |
| vblendpd $3, 3120(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload | |
| # ymm0 = mem[0,1],ymm0[2,3] | |
| vblendps $3, %xmm14, %xmm10, %xmm10 # xmm10 = xmm14[0,1],xmm10[2,3] | |
| vmovapd .LCPI0_20(%rip), %zmm14 # zmm14 = [0,8,0,8,4,12,4,13] | |
| vshufpd $128, %zmm3, %zmm20, %zmm19 {%k4} # zmm19 {%k4} = zmm20[0],zmm3[0],zmm20[2],zmm3[2],zmm20[4],zmm3[4],zmm20[6],zmm3[7] | |
| vshuff64x2 $170, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[4,5,4,5,4,5,4,5] | |
| vpermt2pd %zmm3, %zmm14, %zmm20 | |
| vmovupd 2544(%rsp), %zmm3 # 64-byte Reload | |
| vpermt2pd %zmm9, %zmm14, %zmm27 | |
| vpermt2pd %zmm7, %zmm14, %zmm24 | |
| vmovups 784(%rsp), %ymm7 # 32-byte Reload | |
| vmovaps %ymm31, %ymm9 | |
| vshuff64x2 $170, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[4,5,4,5,4,5,4,5] | |
| vblendpd $8, %ymm3, %ymm2, %ymm2 # ymm2 = ymm2[0,1,2],ymm3[3] | |
| vextractf32x4 $2, %zmm25, %xmm3 | |
| vmovupd 2928(%rsp), %zmm25 # 64-byte Reload | |
| vblendpd $3, %ymm10, %ymm2, %ymm2 # ymm2 = ymm10[0,1],ymm2[2,3] | |
| vmovupd 1840(%rsp), %zmm10 # 64-byte Reload | |
| vblendps $3, %xmm3, %xmm6, %xmm3 # xmm3 = xmm3[0,1],xmm6[2,3] | |
| vshuff64x2 $170, %zmm8, %zmm8, %zmm6 # zmm6 = zmm8[4,5,4,5,4,5,4,5] | |
| vshuff64x2 $170, %zmm28, %zmm28, %zmm8 # zmm8 = zmm28[4,5,4,5,4,5,4,5] | |
| vblendpd $8, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0,1,2],ymm6[3] | |
| vinsertf128 $1, %xmm4, %ymm0, %ymm8 | |
| vblendpd $3, %ymm3, %ymm6, %ymm3 # ymm3 = ymm3[0,1],ymm6[2,3] | |
| vmovlhps %xmm31, %xmm30, %xmm6 # xmm6 = xmm30[0],xmm31[0] | |
| vshufps $36, %xmm6, %xmm5, %xmm5 # xmm5 = xmm5[0,1],xmm6[2,0] | |
| vpunpckldq %xmm21, %xmm1, %xmm6 # xmm6 = xmm1[0],xmm21[0],xmm1[1],xmm21[1] | |
| vinsertf128 $1, %xmm6, %ymm0, %ymm6 | |
| vblendpd $8, %ymm8, %ymm6, %ymm6 # ymm6 = ymm6[0,1,2],ymm8[3] | |
| vshufpd $128, %zmm22, %zmm25, %zmm10 {%k4} # zmm10 {%k4} = zmm25[0],zmm22[0],zmm25[2],zmm22[2],zmm25[4],zmm22[4],zmm25[6],zmm22[7] | |
| vpermt2pd %zmm22, %zmm14, %zmm25 | |
| vbroadcastsd .LCPI0_30(%rip), %ymm14 # ymm14 = [5,13,5,13,5,13,5,13] | |
| vshufps $51, %xmm29, %xmm17, %xmm8 # xmm8 = xmm17[3,0],xmm29[3,0] | |
| vblendpd $3, %ymm5, %ymm6, %ymm5 # ymm5 = ymm5[0,1],ymm6[2,3] | |
| vunpckhps %ymm30, %ymm31, %ymm6 # ymm6 = ymm31[2],ymm30[2],ymm31[3],ymm30[3],ymm31[6],ymm30[6],ymm31[7],ymm30[7] | |
| vinsertf64x4 $0, %ymm0, %zmm10, %zmm0 | |
| vinsertps $179, %xmm30, %xmm31, %xmm10 # xmm10 = zero,zero,xmm31[2],xmm30[2] | |
| vinsertf64x4 $0, %ymm5, %zmm26, %zmm4 | |
| vunpckhps %ymm17, %ymm29, %ymm5 # ymm5 = ymm29[2],ymm17[2],ymm29[3],ymm17[3],ymm29[6],ymm17[6],ymm29[7],ymm17[7] | |
| vpermpd $170, %ymm6, %ymm6 # ymm6 = ymm6[2,2,2,2] | |
| vextractf128 $1, %ymm5, %xmm5 | |
| vmovupd %zmm4, 416(%rsp) # 64-byte Spill | |
| vblendps $3, %xmm5, %xmm6, %xmm5 # xmm5 = xmm5[0,1],xmm6[2,3] | |
| vpunpckhdq %ymm21, %ymm1, %ymm6 # ymm6 = ymm1[2],ymm21[2],ymm1[3],ymm21[3],ymm1[6],ymm21[6],ymm1[7],ymm21[7] | |
| vshufps $36, %ymm18, %ymm6, %ymm6 # ymm6 = ymm6[0,1],ymm18[2,0],ymm6[4,5],ymm18[6,4] | |
| vblendps $15, %ymm5, %ymm6, %ymm4 # ymm4 = ymm5[0,1,2,3],ymm6[4,5,6,7] | |
| vmovups 288(%rsp), %zmm5 # 64-byte Reload | |
| vunpckhps %xmm17, %xmm29, %xmm6 # xmm6 = xmm29[2],xmm17[2],xmm29[3],xmm17[3] | |
| vpermt2ps %ymm30, %ymm14, %ymm9 | |
| vmovaps .LCPI0_31(%rip), %ymm14 # ymm14 = [1,9,2,3,5,13,u,u] | |
| vinsertf64x4 $0, %ymm4, %zmm5, %zmm26 | |
| vunpcklps %ymm17, %ymm29, %ymm4 # ymm4 = ymm29[0],ymm17[0],ymm29[1],ymm17[1],ymm29[4],ymm17[4],ymm29[5],ymm17[5] | |
| vunpcklps %ymm30, %ymm31, %ymm5 # ymm5 = ymm31[0],ymm30[0],ymm31[1],ymm30[1],ymm31[4],ymm30[4],ymm31[5],ymm30[5] | |
| vextractf128 $1, %ymm4, %xmm4 | |
| vpermpd $170, %ymm5, %ymm5 # ymm5 = ymm5[2,2,2,2] | |
| vblendps $3, %xmm4, %xmm5, %xmm4 # xmm4 = xmm4[0,1],xmm5[2,3] | |
| vpunpckldq %ymm21, %ymm1, %ymm5 # ymm5 = ymm1[0],ymm21[0],ymm1[1],ymm21[1],ymm1[4],ymm21[4],ymm1[5],ymm21[5] | |
| vshufps $36, 2672(%rsp), %ymm5, %ymm5 # 32-byte Folded Reload | |
| # ymm5 = ymm5[0,1],mem[2,0],ymm5[4,5],mem[6,4] | |
| vblendps $15, %ymm4, %ymm5, %ymm4 # ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] | |
| vmovups 560(%rsp), %zmm5 # 64-byte Reload | |
| vmovupd %zmm0, 560(%rsp) # 64-byte Spill | |
| vunpcklps %xmm30, %xmm31, %xmm0 # xmm0 = xmm31[0],xmm30[0],xmm31[1],xmm30[1] | |
| vmovups (%rdi,%r10), %zmm30 {%k2} {z} | |
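| # [editor annotation] Zero-masked tile loads (py:221) start here; each zmm | |
| # takes 16 floats from a distinct row pointer precomputed before the loop. | |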
| vinsertf64x4 $0, %ymm4, %zmm5, %zmm4 | |
| vmovaps %ymm29, %ymm5 | |
| vpermt2ps %ymm17, %ymm7, %ymm5 | |
| vinsertf64x4 $0, 1712(%rsp), %zmm23, %zmm7 # 32-byte Folded Reload | |
| .loc 1 224 35 is_stmt 1 # 03-matrix-multiplication-cpu.py:224:35 | |
| vbroadcastss %xmm30, %zmm31 | |
| vmovups %zmm4, 288(%rsp) # 64-byte Spill | |
| vinsertf64x4 $0, 880(%rsp), %zmm19, %zmm4 # 32-byte Folded Reload | |
| .loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20 | |
| vmovups (%rdi,%rax), %zmm19 {%k2} {z} | |
| movq -32(%rsp), %rax # 8-byte Reload | |
| vextractf128 $1, %ymm5, %xmm5 | |
| vblendps $3, %xmm5, %xmm9, %xmm5 # xmm5 = xmm5[0,1],xmm9[2,3] | |
| vmovdqa %ymm1, %ymm9 | |
| vpermt2ps %ymm21, %ymm14, %ymm9 | |
| vblendps $192, 2864(%rsp), %ymm9, %ymm9 # 32-byte Folded Reload | |
| # ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] | |
| vmovupd %zmm7, 32(%rsp) # 64-byte Spill | |
| vmovups 1328(%rsp), %zmm7 # 64-byte Reload | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vbroadcastss %xmm19, %zmm23 | |
| vmovupd %zmm4, 160(%rsp) # 64-byte Spill | |
| vinsertps $76, %xmm29, %xmm17, %xmm4 # xmm4 = xmm29[1],xmm17[1],zero,zero | |
| .loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20 | |
| vmovups (%rdi,%rax), %zmm29 {%k2} {z} | |
| vblendps $15, %ymm5, %ymm9, %ymm5 # ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] | |
| vinsertf128 $1, 2992(%rsp), %ymm0, %ymm9 # 16-byte Folded Reload | |
| vinsertf64x4 $0, %ymm5, %zmm7, %zmm28 | |
| vmovupd 1136(%rsp), %zmm5 # 64-byte Reload | |
| vmovupd 1968(%rsp), %zmm7 # 64-byte Reload | |
| vpermt2pd 1648(%rsp), %zmm16, %zmm5 # 64-byte Folded Reload | |
| vpermt2pd 2032(%rsp), %zmm16, %zmm7 # 64-byte Folded Reload | |
| vmovapd %zmm7, %zmm5 {%k4} | |
| vmovapd %zmm5, %zmm7 | |
| vblendps $3, %xmm6, %xmm10, %xmm5 # xmm5 = xmm6[0,1],xmm10[2,3] | |
| vinsertf64x4 $0, 1424(%rsp), %zmm12, %zmm6 # 32-byte Folded Reload | |
| vmovups (%rdi,%rbp), %zmm10 {%k2} {z} | |
| vmovupd %zmm6, 224(%rsp) # 64-byte Spill | |
| vpunpckhdq %xmm21, %xmm1, %xmm6 # xmm6 = xmm1[2],xmm21[2],xmm1[3],xmm21[3] | |
| vinsertf128 $1, %xmm6, %ymm0, %ymm6 | |
| vblendps $192, %ymm9, %ymm6, %ymm6 # ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7] | |
| vmovupd 96(%rsp), %zmm9 # 64-byte Reload | |
| vblendps $15, %ymm5, %ymm6, %ymm5 # ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] | |
| vmovdqa %xmm1, %xmm6 | |
| vinsertf64x4 $0, %ymm5, %zmm7, %zmm17 | |
| vinsertps $76, %xmm1, %xmm21, %xmm7 # xmm7 = xmm1[1],xmm21[1],zero,zero | |
| vmovsd .LCPI0_195(%rip), %xmm1 # xmm1 = [3,7,0,0] | |
| vshufps $226, %xmm11, %xmm8, %xmm5 # xmm5 = xmm8[2,0],xmm11[2,3] | |
| vinsertf128 $1, 1520(%rsp), %ymm0, %ymm8 # 16-byte Folded Reload | |
| vmovapd %zmm20, %zmm9 {%k4} | |
| vinsertf64x4 $0, %ymm15, %zmm9, %zmm22 | |
| vmovups (%rdi,%r13), %zmm15 {%k2} {z} | |
| vmovups (%rdi,%r11), %zmm9 {%k2} {z} | |
| vpermt2ps %xmm21, %xmm1, %xmm6 | |
| vmovups 688(%rsp), %zmm1 # 64-byte Reload | |
| vinsertf128 $1, %xmm6, %ymm0, %ymm6 | |
| vblendps $3, %xmm4, %xmm0, %xmm0 # xmm0 = xmm4[0,1],xmm0[2,3] | |
| vinsertf128 $1, %xmm7, %ymm0, %ymm4 | |
| vblendps $192, %ymm8, %ymm6, %ymm6 # ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] | |
| vmovups (%rdi,%rdx), %zmm8 {%k2} {z} | |
| vblendps $15, %ymm5, %ymm6, %ymm5 # ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] | |
| vmovupd 1008(%rsp), %zmm6 # 64-byte Reload | |
| vmovups %zmm15, 96(%rsp) # 64-byte Spill | |
| vpermt2pd 1904(%rsp), %zmm16, %zmm6 # 64-byte Folded Reload | |
| vinsertf64x4 $0, %ymm5, %zmm1, %zmm12 | |
| vmovupd 1200(%rsp), %zmm5 # 64-byte Reload | |
| vmovupd 624(%rsp), %zmm1 # 64-byte Reload | |
| vpermt2pd 944(%rsp), %zmm16, %zmm5 # 64-byte Folded Reload | |
| vmovups (%rdi,%r15), %zmm16 {%k2} {z} | |
| vmovapd %zmm25, %zmm1 {%k4} | |
| vinsertf64x4 $0, %ymm13, %zmm1, %zmm14 | |
| vmovups (%rdi,%r14), %zmm1 {%k2} {z} | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vbroadcastss %xmm16, %zmm25 | |
| vmovapd %zmm6, %zmm5 {%k4} | |
| vmovapd %zmm5, %zmm6 | |
| vinsertf128 $1, 1456(%rsp), %ymm0, %ymm5 # 16-byte Folded Reload | |
| vblendps $192, %ymm5, %ymm4, %ymm4 # ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] | |
| vblendps $15, %ymm0, %ymm4, %ymm0 # ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] | |
| .loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20 | |
| vmovups (%rdi,%rcx), %zmm4 {%k2} {z} | |
| vinsertf64x4 $0, %ymm0, %zmm6, %zmm21 | |
| vmovupd 1264(%rsp), %zmm0 # 64-byte Reload | |
| vmovups (%rdi,%r8), %zmm6 {%k2} {z} | |
| vmovups %zmm1, 1264(%rsp) # 64-byte Spill | |
| vmovaps %zmm4, %zmm7 | |
| vmovapd %zmm27, %zmm0 {%k4} | |
| vinsertf64x4 $0, %ymm2, %zmm0, %zmm18 | |
| .loc 1 222 51 # 03-matrix-multiplication-cpu.py:222:51 | |
| vpbroadcastd %r9d, %zmm0 | |
| vmovaps %zmm6, %zmm27 | |
| .loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22 | |
| addl $-16, %r9d | |
| vmovdqu64 %zmm0, 1200(%rsp) # 64-byte Spill | |
| .loc 1 222 51 # 03-matrix-multiplication-cpu.py:222:51 | |
| vpcmpgtd .LCPI0_193(%rip), %zmm0, %k1 | |
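| # [editor annotation] %zmm0 broadcasts the remaining K (py:222:51); the | |
| # compare against a constant lane-index vector yields %k1, presumably the | |
| # offs_k < K - k bound mask for the next slice's loads. The addl $-16, | |
| # %r9d above steps K by the 16-element block. | |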
| vmovupd 352(%rsp), %zmm13 # 64-byte Reload | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vunpcklps %xmm27, %xmm30, %xmm0 # xmm0 = xmm30[0],xmm27[0],xmm30[1],xmm27[1] | |
| vmovups %zmm4, 1904(%rsp) # 64-byte Spill | |
| vmovaps %zmm8, %zmm4 | |
| movq -96(%rsp), %rax # 8-byte Reload | |
| vmovups %zmm16, 1648(%rsp) # 64-byte Spill | |
| vmovups %zmm10, 1968(%rsp) # 64-byte Spill | |
| vmovups %zmm6, 1776(%rsp) # 64-byte Spill | |
| vmovups %zmm30, 1840(%rsp) # 64-byte Spill | |
| vmovups %zmm4, 624(%rsp) # 64-byte Spill | |
| vmovups %zmm9, 688(%rsp) # 64-byte Spill | |
| vmovups %zmm29, 1712(%rsp) # 64-byte Spill | |
| vmovlhps %xmm10, %xmm0, %xmm0 # xmm0 = xmm0[0],xmm10[0] | |
| vinsertps $48, %xmm7, %xmm0, %xmm0 # xmm0 = xmm0[0,1,2],xmm7[0] | |
| vinsertf128 $1, %xmm15, %ymm0, %ymm2 | |
| vinsertf128 $1, %xmm1, %ymm0, %ymm0 | |
| vmovapd %zmm24, %zmm13 {%k4} | |
| vinsertf64x4 $0, %ymm3, %zmm13, %zmm5 | |
| vbroadcastss %xmm9, %ymm3 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vmovups (%rsi,%rax), %zmm11 {%k1} {z} | |
| movq 512(%rsp), %rax # 8-byte Reload | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd213ps 416(%rsp), %zmm11, %zmm31 # 64-byte Folded Reload | |
| # zmm31 = (zmm11 * zmm31) + mem | |
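| # [editor annotation] Accumulation pattern: one scalar of the left-hand | |
| # tile is broadcast across a zmm (vbroadcastss/vbroadcastsd of a shuffled | |
| # lane), then fused multiply-added with a 16-wide right-hand vector into | |
| # the running C accumulator; repeated across the micro-tile below. | |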
| vblendps $32, %ymm3, %ymm0, %ymm0 # ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] | |
| vshufpd $2, %ymm2, %ymm0, %ymm2 # ymm2 = ymm0[0],ymm2[1],ymm0[2],ymm2[2] | |
| vbroadcastss %xmm4, %ymm0 | |
| vblendps $128, %ymm0, %ymm2, %ymm0 # ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] | |
| .loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20 | |
| vmovups (%rdi,%rax), %zmm3 {%k2} {z} | |
| movq 520(%rsp), %rax # 8-byte Reload | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vinsertf64x4 $1, %ymm16, %zmm0, %zmm13 | |
| vmovaps .LCPI0_109(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,6,7,8,16,u,u,u,u,u,u] | |
| vfmadd213ps %zmm5, %zmm11, %zmm25 # zmm25 = (zmm11 * zmm25) + zmm5 | |
| vbroadcastss %xmm4, %zmm5 | |
| vfmadd213ps %zmm18, %zmm11, %zmm23 # zmm23 = (zmm11 * zmm23) + zmm18 | |
| vfmadd213ps 1072(%rsp), %zmm11, %zmm5 # 64-byte Folded Reload | |
| # zmm5 = (zmm11 * zmm5) + mem | |
| vmovups %zmm3, 1136(%rsp) # 64-byte Spill | |
| vpermt2ps %zmm19, %zmm0, %zmm13 | |
| vmovshdup %xmm13, %xmm0 # xmm0 = xmm13[1,1,3,3] | |
| vbroadcastsd %xmm0, %zmm0 | |
| vfmadd213ps %zmm21, %zmm11, %zmm0 # zmm0 = (zmm11 * zmm0) + zmm21 | |
| vmovups %zmm0, 352(%rsp) # 64-byte Spill | |
| vshufps $255, %xmm13, %xmm13, %xmm0 # xmm0 = xmm13[3,3,3,3] | |
| vbroadcastsd %xmm0, %zmm20 | |
| vshufps $170, %xmm13, %xmm13, %xmm0 # xmm0 = xmm13[2,2,2,2] | |
| vfmadd213ps %zmm12, %zmm11, %zmm20 # zmm20 = (zmm11 * zmm20) + zmm12 | |
| vbroadcastsd %xmm0, %zmm12 | |
| vshufps $170, %ymm2, %ymm2, %ymm0 # ymm0 = ymm2[2,2,2,2,6,6,6,6] | |
| vshuff64x2 $85, %zmm0, %zmm0, %zmm15 # zmm15 = zmm0[2,3,2,3,2,3,2,3] | |
| vshufps $85, %ymm13, %ymm13, %ymm0 # ymm0 = ymm13[1,1,1,1,5,5,5,5] | |
| vshuff64x2 $85, %zmm0, %zmm0, %zmm16 # zmm16 = zmm0[2,3,2,3,2,3,2,3] | |
| vextractf128 $1, %ymm13, %xmm0 | |
| vfmadd213ps %zmm17, %zmm11, %zmm12 # zmm12 = (zmm11 * zmm12) + zmm17 | |
| vmovaps %zmm30, %zmm17 | |
| vbroadcastss %xmm3, %zmm13 | |
| vmovaps %zmm3, %zmm30 | |
| vfmadd213ps %zmm26, %zmm11, %zmm15 # zmm15 = (zmm11 * zmm15) + zmm26 | |
| vbroadcastss %xmm29, %zmm26 | |
| vfmadd213ps %zmm28, %zmm11, %zmm16 # zmm16 = (zmm11 * zmm16) + zmm28 | |
| .loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20 | |
| vmovups (%rdi,%rax), %zmm28 {%k2} {z} | |
| movq -80(%rsp), %rax # 8-byte Reload | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vbroadcastss %xmm0, %zmm8 | |
| vinsertps $76, %xmm17, %xmm27, %xmm0 # xmm0 = xmm17[1],xmm27[1],zero,zero | |
| vfmadd213ps 288(%rsp), %zmm11, %zmm8 # 64-byte Folded Reload | |
| # zmm8 = (zmm11 * zmm8) + mem | |
| vfmadd213ps %zmm22, %zmm11, %zmm13 # zmm13 = (zmm11 * zmm13) + zmm22 | |
| vshufps $212, %xmm10, %xmm0, %xmm0 # xmm0 = xmm0[0,1],xmm10[1,3] | |
| vmovups 1264(%rsp), %zmm10 # 64-byte Reload | |
| vfmadd213ps %zmm14, %zmm11, %zmm26 # zmm26 = (zmm11 * zmm26) + zmm14 | |
| vinsertps $112, %xmm7, %xmm0, %xmm0 # xmm0 = xmm0[0,1,2],xmm7[1] | |
| vmovups 96(%rsp), %zmm7 # 64-byte Reload | |
| .loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20 | |
| vmovups (%rdi,%rax), %zmm24 {%k2} {z} | |
| movq -88(%rsp), %rax # 8-byte Reload | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vbroadcastss %xmm28, %zmm1 | |
| vfmadd213ps 224(%rsp), %zmm11, %zmm1 # 64-byte Folded Reload | |
| # zmm1 = (zmm11 * zmm1) + mem | |
| vmovups %zmm28, 1328(%rsp) # 64-byte Spill | |
| vmovshdup %xmm10, %xmm2 # xmm2 = xmm10[1,1,3,3] | |
| .loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20 | |
| vmovups (%rdi,%rax), %zmm21 {%k2} {z} | |
| movq -72(%rsp), %rax # 8-byte Reload | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vinsertf128 $1, %xmm2, %ymm0, %ymm2 | |
| vblendps $240, %ymm2, %ymm0, %ymm2 # ymm2 = ymm0[0,1,2,3],ymm2[4,5,6,7] | |
| vinsertf128 $1, %xmm9, %ymm0, %ymm0 | |
| vbroadcastss %xmm24, %zmm14 | |
| vfmadd213ps 32(%rsp), %zmm11, %zmm14 # 64-byte Folded Reload | |
| # zmm14 = (zmm11 * zmm14) + mem | |
| vmovups %zmm24, 288(%rsp) # 64-byte Spill | |
| vblendps $34, %ymm0, %ymm2, %ymm0 # ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7] | |
| vshufps $85, %xmm7, %xmm7, %xmm2 # xmm2 = xmm7[1,1,1,1] | |
| .loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20 | |
| vmovups (%rdi,%rax), %zmm6 {%k2} {z} | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vinsertf128 $1, %xmm2, %ymm0, %ymm2 | |
| movq 528(%rsp), %rax # 8-byte Reload | |
| .loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22 | |
| addq $64, %rdi | |
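| # [editor annotation] Advance the %rdi stream by 64 bytes = 16 f32, i.e. | |
| # one K-block step (py:217). | |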
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vbroadcastss %xmm21, %zmm18 | |
| vfmadd213ps 560(%rsp), %zmm11, %zmm18 # 64-byte Folded Reload | |
| # zmm18 = (zmm11 * zmm18) + mem | |
| vmovups %zmm21, 416(%rsp) # 64-byte Spill | |
| vbroadcastss %xmm6, %zmm3 | |
| vfmadd213ps 160(%rsp), %zmm11, %zmm3 # 64-byte Folded Reload | |
| # zmm3 = (zmm11 * zmm3) + mem | |
| vblendps $192, %ymm2, %ymm0, %ymm11 # ymm11 = ymm0[0,1,2,3,4,5],ymm2[6,7] | |
| vmovdqu64 1200(%rsp), %zmm0 # 64-byte Reload | |
| vmovups %zmm6, 560(%rsp) # 64-byte Spill | |
| .loc 1 222 51 # 03-matrix-multiplication-cpu.py:222:51 | |
| vpcmpgtd .LCPI0_108(%rip){1to16}, %zmm0, %k1 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vbroadcastsd %xmm4, %ymm0 | |
| vmovups 1648(%rsp), %zmm4 # 64-byte Reload | |
| vblendps $128, %ymm0, %ymm11, %ymm0 # ymm0 = ymm11[0,1,2,3,4,5,6],ymm0[7] | |
| vshufps $255, %ymm0, %ymm0, %ymm2 # ymm2 = ymm0[3,3,3,3,7,7,7,7] | |
| vshuff64x2 $85, %zmm2, %zmm2, %zmm27 # zmm27 = zmm2[2,3,2,3,2,3,2,3] | |
| vmovaps %zmm19, %zmm2 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vmovups (%rsi,%rax), %zmm22 {%k1} {z} | |
| movq -64(%rsp), %rax # 8-byte Reload | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd213ps %zmm5, %zmm22, %zmm27 # zmm27 = (zmm22 * zmm27) + zmm5 | |
| vshufps $170, %ymm11, %ymm11, %ymm5 # ymm5 = ymm11[2,2,2,2,6,6,6,6] | |
| vmovaps .LCPI0_110(%rip), %zmm11 # zmm11 = [0,1,2,3,4,5,6,7,17,u,u,u,u,u,u,u] | |
| vshuff64x2 $85, %zmm5, %zmm5, %zmm5 # zmm5 = zmm5[2,3,2,3,2,3,2,3] | |
| vfmadd213ps %zmm15, %zmm22, %zmm5 # zmm5 = (zmm22 * zmm5) + zmm15 | |
| vmovaps .LCPI0_112(%rip), %zmm15 # zmm15 = [0,1,2,3,4,5,6,7,8,9,17,u,u,u,u,u] | |
| vpermt2ps %zmm4, %zmm11, %zmm0 | |
| vmovaps .LCPI0_111(%rip), %zmm11 # zmm11 = [0,1,2,3,4,5,6,7,8,17,u,u,u,u,u,u] | |
| vpermt2ps %zmm19, %zmm11, %zmm0 | |
| vmovaps %zmm0, %zmm11 | |
| vpermt2ps %zmm29, %zmm15, %zmm11 | |
| vextractf128 $1, %ymm11, %xmm11 | |
| vbroadcastss %xmm11, %zmm11 | |
| vfmadd213ps %zmm8, %zmm22, %zmm11 # zmm11 = (zmm22 * zmm11) + zmm8 | |
| vshufps $85, %ymm0, %ymm0, %ymm8 # ymm8 = ymm0[1,1,1,1,5,5,5,5] | |
| vshuff64x2 $85, %zmm8, %zmm8, %zmm15 # zmm15 = zmm8[2,3,2,3,2,3,2,3] | |
| vshufps $170, %xmm0, %xmm0, %xmm8 # xmm8 = xmm0[2,2,2,2] | |
| vbroadcastsd %xmm8, %zmm17 | |
| vshufps $255, %xmm0, %xmm0, %xmm8 # xmm8 = xmm0[3,3,3,3] | |
| vmovshdup %xmm0, %xmm0 # xmm0 = xmm0[1,1,3,3] | |
| vbroadcastsd %xmm0, %zmm9 | |
| vmovshdup %xmm6, %xmm0 # xmm0 = xmm6[1,1,3,3] | |
| vbroadcastsd %xmm8, %zmm19 | |
| vfmadd213ps %zmm16, %zmm22, %zmm15 # zmm15 = (zmm22 * zmm15) + zmm16 | |
| vmovaps %zmm2, %zmm6 | |
| vfmadd213ps 352(%rsp), %zmm22, %zmm9 # 64-byte Folded Reload | |
| # zmm9 = (zmm22 * zmm9) + mem | |
| vmovups %zmm6, 1584(%rsp) # 64-byte Spill | |
| vbroadcastsd %xmm0, %zmm0 | |
| vfmadd213ps %zmm12, %zmm22, %zmm17 # zmm17 = (zmm22 * zmm17) + zmm12 | |
| vfmadd213ps %zmm20, %zmm22, %zmm19 # zmm19 = (zmm22 * zmm19) + zmm20 | |
| vfmadd213ps %zmm3, %zmm22, %zmm0 # zmm0 = (zmm22 * zmm0) + zmm3 | |
| vmovshdup %xmm21, %xmm3 # xmm3 = xmm21[1,1,3,3] | |
| vbroadcastsd %xmm3, %zmm8 | |
| vmovshdup %xmm24, %xmm3 # xmm3 = xmm24[1,1,3,3] | |
| vmovups 1776(%rsp), %zmm24 # 64-byte Reload | |
| vbroadcastsd %xmm3, %zmm12 | |
| vmovshdup %xmm28, %xmm3 # xmm3 = xmm28[1,1,3,3] | |
| vmovaps %zmm4, %zmm28 | |
| vpermpd $85, %ymm10, %ymm4 # ymm4 = ymm10[1,1,1,1] | |
| vfmadd213ps %zmm18, %zmm22, %zmm8 # zmm8 = (zmm22 * zmm8) + zmm18 | |
| vfmadd213ps %zmm14, %zmm22, %zmm12 # zmm12 = (zmm22 * zmm12) + zmm14 | |
| vbroadcastsd %xmm3, %zmm14 | |
| vmovups 1840(%rsp), %zmm3 # 64-byte Reload | |
| vfmadd213ps %zmm1, %zmm22, %zmm14 # zmm14 = (zmm22 * zmm14) + zmm1 | |
| vmovshdup %xmm30, %xmm1 # xmm1 = xmm30[1,1,3,3] | |
| vmovaps %zmm29, %zmm30 | |
| vbroadcastsd %xmm1, %zmm16 | |
| vmovshdup %xmm29, %xmm1 # xmm1 = xmm29[1,1,3,3] | |
| vmovaps %zmm28, %zmm29 | |
| vbroadcastsd %xmm1, %zmm18 | |
| vmovshdup %xmm2, %xmm1 # xmm1 = xmm2[1,1,3,3] | |
| vmovups 1904(%rsp), %zmm2 # 64-byte Reload | |
| vfmadd213ps %zmm13, %zmm22, %zmm16 # zmm16 = (zmm22 * zmm16) + zmm13 | |
| vbroadcastsd %xmm1, %zmm13 | |
| vmovshdup %xmm28, %xmm1 # xmm1 = xmm28[1,1,3,3] | |
| vbroadcastsd %xmm1, %zmm20 | |
| vfmadd213ps %zmm26, %zmm22, %zmm18 # zmm18 = (zmm22 * zmm18) + zmm26 | |
| vmovdqu64 1200(%rsp), %zmm26 # 64-byte Reload | |
| vfmadd213ps %zmm23, %zmm22, %zmm13 # zmm13 = (zmm22 * zmm13) + zmm23 | |
| .loc 1 222 51 # 03-matrix-multiplication-cpu.py:222:51 | |
| vpcmpgtd .LCPI0_9(%rip){1to16}, %zmm26, %k1 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vmovshdup %xmm3, %xmm1 # xmm1 = xmm3[1,1,3,3] | |
| vfmadd213ps %zmm25, %zmm22, %zmm20 # zmm20 = (zmm22 * zmm20) + zmm25 | |
| vmovaps %zmm3, %zmm25 | |
| vbroadcastss %xmm1, %zmm21 | |
| vunpckhps %xmm24, %xmm3, %xmm1 # xmm1 = xmm3[2],xmm24[2],xmm3[3],xmm24[3] | |
| vblendps $12, 1968(%rsp), %xmm1, %xmm1 # 16-byte Folded Reload | |
| # xmm1 = xmm1[0,1],mem[2,3] | |
| vpermilps $170, 688(%rsp), %xmm3 # 16-byte Folded Reload | |
| # xmm3 = mem[2,2,2,2] | |
| vfmadd213ps %zmm31, %zmm22, %zmm21 # zmm21 = (zmm22 * zmm21) + zmm31 | |
| vmovaps %zmm6, %zmm31 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vmovups (%rsi,%rax), %zmm22 {%k1} {z} | |
| .loc 1 222 51 is_stmt 0 # 03-matrix-multiplication-cpu.py:222:51 | |
| vpcmpgtd .LCPI0_107(%rip){1to16}, %zmm26, %k1 | |
| movq 536(%rsp), %rax # 8-byte Reload | |
| .loc 1 224 35 is_stmt 1 # 03-matrix-multiplication-cpu.py:224:35 | |
| vinsertps $176, %xmm2, %xmm1, %xmm1 # xmm1 = xmm1[0,1,2],xmm2[2] | |
| vinsertf128 $1, %xmm3, %ymm0, %ymm3 | |
| vblendps $240, %ymm4, %ymm1, %ymm4 # ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] | |
| vblendps $32, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] | |
| vinsertf128 $1, %xmm7, %ymm1, %ymm4 | |
| vblendps $204, %ymm4, %ymm3, %ymm1 # ymm1 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] | |
| vmovsldup 624(%rsp), %xmm3 # 16-byte Folded Reload | |
| # xmm3 = mem[0,0,2,2] | |
| vinsertf128 $1, %xmm3, %ymm0, %ymm3 | |
| vblendps $128, %ymm3, %ymm1, %ymm1 # ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] | |
| vmovapd .LCPI0_113(%rip), %zmm3 # zmm3 = [0,1,2,3,9,u,u,u] | |
| vmovaps %zmm1, %zmm23 | |
| vshufps $255, %ymm1, %ymm1, %ymm1 # ymm1 = ymm1[3,3,3,3,7,7,7,7] | |
| vpermt2pd %zmm28, %zmm3, %zmm23 | |
| vmovaps .LCPI0_114(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,18,u,u,u,u,u,u] | |
| vpermt2ps %zmm6, %zmm3, %zmm23 | |
| vmovshdup %xmm23, %xmm3 # xmm3 = xmm23[1,1,3,3] | |
| vshufps $170, %xmm23, %xmm23, %xmm7 # xmm7 = xmm23[2,2,2,2] | |
| vshufps $255, %xmm23, %xmm23, %xmm6 # xmm6 = xmm23[3,3,3,3] | |
| vbroadcastsd %xmm3, %zmm28 | |
| vbroadcastsd %xmm7, %zmm7 | |
| vbroadcastsd %xmm6, %zmm6 | |
| vfmadd213ps %zmm9, %zmm22, %zmm28 # zmm28 = (zmm22 * zmm28) + zmm9 | |
| vshufps $85, %ymm23, %ymm23, %ymm9 # ymm9 = ymm23[1,1,1,1,5,5,5,5] | |
| vfmadd213ps %zmm17, %zmm22, %zmm7 # zmm7 = (zmm22 * zmm7) + zmm17 | |
| vshuff64x2 $85, %zmm1, %zmm1, %zmm17 # zmm17 = zmm1[2,3,2,3,2,3,2,3] | |
| vshufps $170, %ymm4, %ymm4, %ymm1 # ymm1 = ymm4[2,2,2,2,6,6,6,6] | |
| vfmadd213ps %zmm19, %zmm22, %zmm6 # zmm6 = (zmm22 * zmm6) + zmm19 | |
| vmovups 1968(%rsp), %zmm4 # 64-byte Reload | |
| vshuff64x2 $85, %zmm9, %zmm9, %zmm10 # zmm10 = zmm9[2,3,2,3,2,3,2,3] | |
| vextractf32x4 $1, %ymm23, %xmm9 | |
| vshuff64x2 $85, %zmm1, %zmm1, %zmm19 # zmm19 = zmm1[2,3,2,3,2,3,2,3] | |
| vfmadd213ps %zmm27, %zmm22, %zmm17 # zmm17 = (zmm22 * zmm17) + zmm27 | |
| vmovups 1328(%rsp), %zmm27 # 64-byte Reload | |
| vbroadcastss %xmm9, %zmm3 | |
| vfmadd213ps %zmm15, %zmm22, %zmm10 # zmm10 = (zmm22 * zmm10) + zmm15 | |
| vfmadd213ps %zmm5, %zmm22, %zmm19 # zmm19 = (zmm22 * zmm19) + zmm5 | |
| vmovups 1264(%rsp), %zmm5 # 64-byte Reload | |
| vfmadd213ps %zmm11, %zmm22, %zmm3 # zmm3 = (zmm22 * zmm3) + zmm11 | |
| vmovups %zmm3, 32(%rsp) # 64-byte Spill | |
| vmovaps %zmm25, %zmm3 | |
| vshufpd $1, %xmm3, %xmm3, %xmm1 # xmm1 = xmm3[1,0] | |
| vmovups 560(%rsp), %zmm25 # 64-byte Reload | |
| vbroadcastss %xmm1, %zmm23 | |
| vshufps $170, %xmm29, %xmm29, %xmm1 # xmm1 = xmm29[2,2,2,2] | |
| vbroadcastsd %xmm1, %zmm9 | |
| vshufps $170, %xmm31, %xmm31, %xmm1 # xmm1 = xmm31[2,2,2,2] | |
| vmovups 416(%rsp), %zmm31 # 64-byte Reload | |
| vfmadd213ps %zmm21, %zmm22, %zmm23 # zmm23 = (zmm22 * zmm23) + zmm21 | |
| vmovaps %zmm29, %zmm21 | |
| vmovups 1136(%rsp), %zmm29 # 64-byte Reload | |
| vbroadcastsd %xmm1, %zmm11 | |
| vshufps $170, %xmm30, %xmm30, %xmm1 # xmm1 = xmm30[2,2,2,2] | |
| vmovups 288(%rsp), %zmm30 # 64-byte Reload | |
| vfmadd213ps %zmm20, %zmm22, %zmm9 # zmm9 = (zmm22 * zmm9) + zmm20 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vmovups (%rsi,%rax), %zmm20 {%k1} {z} | |
| .loc 1 222 51 is_stmt 0 # 03-matrix-multiplication-cpu.py:222:51 | |
| vpcmpgtd .LCPI0_8(%rip){1to16}, %zmm26, %k1 | |
| movq -56(%rsp), %rax # 8-byte Reload | |
| .loc 1 224 35 is_stmt 1 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd213ps %zmm13, %zmm22, %zmm11 # zmm11 = (zmm22 * zmm11) + zmm13 | |
| vbroadcastsd %xmm1, %zmm13 | |
| vfmadd213ps %zmm18, %zmm22, %zmm13 # zmm13 = (zmm22 * zmm13) + zmm18 | |
| vshufps $170, %xmm29, %xmm29, %xmm1 # xmm1 = xmm29[2,2,2,2] | |
| vbroadcastsd %xmm1, %zmm15 | |
| vshufps $170, %xmm27, %xmm27, %xmm1 # xmm1 = xmm27[2,2,2,2] | |
| vfmadd213ps %zmm16, %zmm22, %zmm15 # zmm15 = (zmm22 * zmm15) + zmm16 | |
| vbroadcastsd %xmm1, %zmm16 | |
| vshufps $170, %xmm30, %xmm30, %xmm1 # xmm1 = xmm30[2,2,2,2] | |
| vfmadd213ps %zmm14, %zmm22, %zmm16 # zmm16 = (zmm22 * zmm16) + zmm14 | |
| vbroadcastsd %xmm1, %zmm14 | |
| vshufps $170, %xmm31, %xmm31, %xmm1 # xmm1 = xmm31[2,2,2,2] | |
| vfmadd213ps %zmm12, %zmm22, %zmm14 # zmm14 = (zmm22 * zmm14) + zmm12 | |
| vbroadcastsd %xmm1, %zmm12 | |
| vshufps $170, %xmm25, %xmm25, %xmm1 # xmm1 = xmm25[2,2,2,2] | |
| vfmadd213ps %zmm8, %zmm22, %zmm12 # zmm12 = (zmm22 * zmm12) + zmm8 | |
| vbroadcastsd %xmm1, %zmm8 | |
| vshufps $255, %xmm5, %xmm5, %xmm1 # xmm1 = xmm5[3,3,3,3] | |
| vfmadd213ps %zmm0, %zmm22, %zmm8 # zmm8 = (zmm22 * zmm8) + zmm0 | |
| vshufps $51, %xmm3, %xmm24, %xmm0 # xmm0 = xmm24[3,0],xmm3[3,0] | |
| vmovups 688(%rsp), %zmm3 # 64-byte Reload | |
| vshufps $242, %xmm4, %xmm0, %xmm0 # xmm0 = xmm0[2,0],xmm4[3,3] | |
| vblendps $8, %xmm2, %xmm0, %xmm0 # xmm0 = xmm0[0,1,2],xmm2[3] | |
| vinsertf128 $1, %xmm1, %ymm0, %ymm1 | |
| vblendps $240, %ymm1, %ymm0, %ymm1 # ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] | |
| vpermpd $85, %ymm3, %ymm2 # ymm2 = ymm3[1,1,1,1] | |
| vblendps $32, %ymm2, %ymm1, %ymm1 # ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] | |
| vmovshdup 96(%rsp), %xmm2 # 16-byte Folded Reload | |
| # xmm2 = mem[1,1,3,3] | |
| vinsertf128 $1, %xmm2, %ymm0, %ymm2 | |
| vblendps $192, %ymm2, %ymm1, %ymm1 # ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] | |
| vshufps $170, %ymm1, %ymm1, %ymm2 # ymm2 = ymm1[2,2,2,2,6,6,6,6] | |
| vshuff64x2 $85, %zmm2, %zmm2, %zmm24 # zmm24 = zmm2[2,3,2,3,2,3,2,3] | |
| vmovups 624(%rsp), %zmm2 # 64-byte Reload | |
| vfmadd213ps %zmm19, %zmm20, %zmm24 # zmm24 = (zmm20 * zmm24) + zmm19 | |
| vinsertf128 $1, %xmm2, %ymm0, %ymm0 | |
| vblendps $136, %ymm0, %ymm1, %ymm1 # ymm1 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] | |
| vshufps $255, %ymm0, %ymm0, %ymm0 # ymm0 = ymm0[3,3,3,3,7,7,7,7] | |
| vshuff64x2 $85, %zmm0, %zmm0, %zmm18 # zmm18 = zmm0[2,3,2,3,2,3,2,3] | |
| vmovaps .LCPI0_115(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,6,7,19,u,u,u,u,u,u,u] | |
| vfmadd213ps %zmm17, %zmm20, %zmm18 # zmm18 = (zmm20 * zmm18) + zmm17 | |
| vmovups 1584(%rsp), %zmm17 # 64-byte Reload | |
| vpermt2ps %zmm21, %zmm0, %zmm1 | |
| vmovaps .LCPI0_116(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,6,7,8,19,u,u,u,u,u,u] | |
| vpermt2ps %zmm17, %zmm0, %zmm1 | |
| vshufps $85, %ymm1, %ymm1, %ymm0 # ymm0 = ymm1[1,1,1,1,5,5,5,5] | |
| vshuff64x2 $85, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[2,3,2,3,2,3,2,3] | |
| vfmadd213ps %zmm10, %zmm20, %zmm0 # zmm0 = (zmm20 * zmm0) + zmm10 | |
| vmovups %zmm0, 160(%rsp) # 64-byte Spill | |
| vshufps $170, %xmm1, %xmm1, %xmm0 # xmm0 = xmm1[2,2,2,2] | |
| vbroadcastsd %xmm0, %zmm22 | |
| vshufps $255, %xmm1, %xmm1, %xmm0 # xmm0 = xmm1[3,3,3,3] | |
| vfmadd213ps %zmm7, %zmm20, %zmm22 # zmm22 = (zmm20 * zmm22) + zmm7 | |
| vbroadcastsd %xmm0, %zmm7 | |
| vmovshdup %xmm1, %xmm0 # xmm0 = xmm1[1,1,3,3] | |
| vbroadcastsd %xmm0, %zmm10 | |
| vshufps $255, %xmm25, %xmm25, %xmm0 # xmm0 = xmm25[3,3,3,3] | |
| vmovaps %zmm29, %zmm25 | |
| vmovups 1840(%rsp), %zmm29 # 64-byte Reload | |
| vbroadcastsd %xmm0, %zmm0 | |
| vfmadd213ps %zmm6, %zmm20, %zmm7 # zmm7 = (zmm20 * zmm7) + zmm6 | |
| vmovups 1904(%rsp), %zmm6 # 64-byte Reload | |
| vfmadd213ps %zmm28, %zmm20, %zmm10 # zmm10 = (zmm20 * zmm10) + zmm28 | |
| vfmadd213ps %zmm8, %zmm20, %zmm0 # zmm0 = (zmm20 * zmm0) + zmm8 | |
| vmovups %zmm0, 224(%rsp) # 64-byte Spill | |
| vshufps $255, %xmm31, %xmm31, %xmm0 # xmm0 = xmm31[3,3,3,3] | |
| vmovaps %zmm21, %zmm31 | |
| vbroadcastsd %xmm0, %zmm19 | |
| vshufps $255, %xmm30, %xmm30, %xmm0 # xmm0 = xmm30[3,3,3,3] | |
| vmovaps %zmm17, %zmm30 | |
| vbroadcastsd %xmm0, %zmm8 | |
| vshufps $255, %xmm27, %xmm27, %xmm0 # xmm0 = xmm27[3,3,3,3] | |
| vmovups 1712(%rsp), %zmm27 # 64-byte Reload | |
| vfmadd213ps %zmm12, %zmm20, %zmm19 # zmm19 = (zmm20 * zmm19) + zmm12 | |
| vbroadcastsd %xmm0, %zmm12 | |
| vshufps $255, %xmm25, %xmm25, %xmm0 # xmm0 = xmm25[3,3,3,3] | |
| vfmadd213ps %zmm14, %zmm20, %zmm8 # zmm8 = (zmm20 * zmm8) + zmm14 | |
| vbroadcastsd %xmm0, %zmm14 | |
| vfmadd213ps %zmm16, %zmm20, %zmm12 # zmm12 = (zmm20 * zmm12) + zmm16 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vmovups (%rsi,%rax), %zmm16 {%k1} {z} | |
| movq (%rsp), %rax # 8-byte Reload | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd213ps %zmm15, %zmm20, %zmm14 # zmm14 = (zmm20 * zmm14) + zmm15 | |
| vshufps $255, %xmm27, %xmm27, %xmm0 # xmm0 = xmm27[3,3,3,3] | |
| vbroadcastsd %xmm0, %zmm15 | |
| vshufps $255, %xmm17, %xmm17, %xmm0 # xmm0 = xmm17[3,3,3,3] | |
| vfmadd213ps %zmm13, %zmm20, %zmm15 # zmm15 = (zmm20 * zmm15) + zmm13 | |
| vbroadcastsd %xmm0, %zmm13 | |
| vshufps $255, %xmm21, %xmm21, %xmm0 # xmm0 = xmm21[3,3,3,3] | |
| vfmadd213ps %zmm11, %zmm20, %zmm13 # zmm13 = (zmm20 * zmm13) + zmm11 | |
| vbroadcastsd %xmm0, %zmm11 | |
| vshufps $255, %xmm29, %xmm29, %xmm0 # xmm0 = xmm29[3,3,3,3] | |
| vbroadcastss %xmm0, %zmm17 | |
| vmovaps .LCPI0_117(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,6,7,8,9,19,u,u,u,u,u] | |
| vfmadd213ps %zmm9, %zmm20, %zmm11 # zmm11 = (zmm20 * zmm11) + zmm9 | |
| vfmadd213ps %zmm23, %zmm20, %zmm17 # zmm17 = (zmm20 * zmm17) + zmm23 | |
| vpermt2ps %zmm27, %zmm0, %zmm1 | |
| vextractf128 $1, %ymm1, %xmm0 | |
| vbroadcastss %xmm0, %zmm9 | |
| vfmadd213ps 32(%rsp), %zmm20, %zmm9 # 64-byte Folded Reload | |
| # zmm9 = (zmm20 * zmm9) + mem | |
| vmovups 1776(%rsp), %zmm20 # 64-byte Reload | |
| vunpcklps %ymm20, %ymm29, %ymm0 # ymm0 = ymm29[0],ymm20[0],ymm29[1],ymm20[1],ymm29[4],ymm20[4],ymm29[5],ymm20[5] | |
| vextractf128 $1, %ymm0, %xmm1 | |
| vextractf128 $1, %ymm4, %xmm0 | |
| vextractf128 $1, %ymm6, %xmm4 | |
| vmovaps %xmm0, 880(%rsp) # 16-byte Spill | |
| vmovlhps %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0] | |
| vmovaps %xmm4, 2032(%rsp) # 16-byte Spill | |
| vinsertps $48, %xmm4, %xmm0, %xmm0 # xmm0 = xmm0[0,1,2],xmm4[0] | |
| vextractf128 $1, %ymm3, %xmm4 | |
| vmovaps %zmm31, %zmm3 | |
| vblendps $240, %ymm5, %ymm0, %ymm0 # ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] | |
| vmovaps %zmm30, %zmm5 | |
| vbroadcastss %xmm4, %ymm4 | |
| vblendps $32, %ymm4, %ymm0, %ymm4 # ymm4 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] | |
| vmovapd .LCPI0_118(%rip), %ymm0 # ymm0 = [0,1,2,6] | |
| vpermt2pd 96(%rsp), %ymm0, %ymm4 # 32-byte Folded Reload | |
| vextractf128 $1, %ymm2, %xmm0 | |
| vbroadcastss %xmm0, %zmm0 | |
| vfmadd231ps %zmm0, %zmm16, %zmm18 # zmm18 = (zmm16 * zmm0) + zmm18 | |
| vblendps $128, %ymm0, %ymm4, %ymm0 # ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7] | |
| vshufps $170, %ymm4, %ymm4, %ymm4 # ymm4 = ymm4[2,2,2,2,6,6,6,6] | |
| vshuff64x2 $85, %zmm4, %zmm4, %zmm21 # zmm21 = zmm4[2,3,2,3,2,3,2,3] | |
| vbroadcastss %xmm1, %zmm4 | |
| vextractf128 $1, %ymm3, %xmm1 | |
| vshuff64x2 $212, %zmm31, %zmm0, %zmm0 # zmm0 = zmm0[0,1,2,3],zmm31[2,3,6,7] | |
| vmovups 288(%rsp), %zmm3 # 64-byte Reload | |
| vmovups 784(%rsp), %ymm31 # 32-byte Reload | |
| vfmadd213ps %zmm24, %zmm16, %zmm21 # zmm21 = (zmm16 * zmm21) + zmm24 | |
| vbroadcastss %xmm1, %zmm24 | |
| vextractf128 $1, %ymm5, %xmm1 | |
| vfmadd213ps %zmm17, %zmm16, %zmm4 # zmm4 = (zmm16 * zmm4) + zmm17 | |
| vbroadcastss %xmm1, %zmm23 | |
| vextractf32x4 $1, %ymm27, %xmm1 | |
| vfmadd213ps %zmm11, %zmm16, %zmm24 # zmm24 = (zmm16 * zmm24) + zmm11 | |
| vbroadcastss %xmm1, %zmm2 | |
| vmovaps .LCPI0_119(%rip), %zmm1 # zmm1 = [0,1,2,3,4,5,6,7,8,20,u,u,u,u,u,u] | |
| vfmadd213ps %zmm13, %zmm16, %zmm23 # zmm23 = (zmm16 * zmm23) + zmm13 | |
| vextractf128 $1, %ymm3, %xmm5 | |
| vmovups 416(%rsp), %zmm3 # 64-byte Reload | |
| vfmadd213ps %zmm15, %zmm16, %zmm2 # zmm2 = (zmm16 * zmm2) + zmm15 | |
| vbroadcastss %xmm5, %zmm5 | |
| vfmadd213ps %zmm8, %zmm16, %zmm5 # zmm5 = (zmm16 * zmm5) + zmm8 | |
| vmovupd 2288(%rsp), %zmm8 # 64-byte Reload | |
| vpermt2ps %zmm30, %zmm1, %zmm0 | |
| vmovshdup %xmm0, %xmm1 # xmm1 = xmm0[1,1,3,3] | |
| vbroadcastsd %xmm1, %zmm28 | |
| vextractf32x4 $1, %ymm25, %xmm1 | |
| vmovaps %zmm2, %zmm25 | |
| vbroadcastss %xmm1, %zmm26 | |
| vshufps $255, %xmm0, %xmm0, %xmm1 # xmm1 = xmm0[3,3,3,3] | |
| vfmadd213ps %zmm10, %zmm16, %zmm28 # zmm28 = (zmm16 * zmm28) + zmm10 | |
| vinsertps $76, %xmm24, %xmm23, %xmm10 # xmm10 = xmm24[1],xmm23[1],zero,zero | |
| vbroadcastsd %xmm1, %zmm17 | |
| vmovups 1328(%rsp), %zmm1 # 64-byte Reload | |
| vfmadd213ps %zmm14, %zmm16, %zmm26 # zmm26 = (zmm16 * zmm26) + zmm14 | |
| vfmadd213ps %zmm7, %zmm16, %zmm17 # zmm17 = (zmm16 * zmm17) + zmm7 | |
| vshufps $85, %ymm0, %ymm0, %ymm7 # ymm7 = ymm0[1,1,1,1,5,5,5,5] | |
| vshuff64x2 $85, %zmm7, %zmm7, %zmm27 # zmm27 = zmm7[2,3,2,3,2,3,2,3] | |
| vextractf128 $1, %ymm3, %xmm7 | |
| vpermpd $170, %ymm6, %ymm3 # ymm3 = ymm6[2,2,2,2] | |
| vunpcklps %xmm23, %xmm24, %xmm6 # xmm6 = xmm24[0],xmm23[0],xmm24[1],xmm23[1] | |
| vfmadd213ps 160(%rsp), %zmm16, %zmm27 # 64-byte Folded Reload | |
| # zmm27 = (zmm16 * zmm27) + mem | |
| vbroadcastss %xmm7, %zmm13 | |
| vfmadd213ps %zmm19, %zmm16, %zmm13 # zmm13 = (zmm16 * zmm13) + zmm19 | |
| vunpcklps %zmm23, %zmm24, %zmm19 # zmm19 = zmm24[0],zmm23[0],zmm24[1],zmm23[1],zmm24[4],zmm23[4],zmm24[5],zmm23[5],zmm24[8],zmm23[8],zmm24[9],zmm23[9],zmm24[12],zmm23[12],zmm24[13],zmm23[13] | |
| vextractf128 $1, %ymm1, %xmm1 | |
| vbroadcastss %xmm1, %zmm14 | |
| vshufps $170, %xmm0, %xmm0, %xmm1 # xmm1 = xmm0[2,2,2,2] | |
| vextractf128 $1, %ymm0, %xmm0 | |
| vbroadcastsd %xmm1, %zmm1 | |
| vbroadcastss %xmm0, %zmm7 | |
| vmovups 560(%rsp), %zmm0 # 64-byte Reload | |
| vfmadd213ps %zmm12, %zmm16, %zmm14 # zmm14 = (zmm16 * zmm14) + zmm12 | |
| vmovapd %zmm8, %zmm12 | |
| vfmadd213ps %zmm22, %zmm16, %zmm1 # zmm1 = (zmm16 * zmm1) + zmm22 | |
| vfmadd213ps %zmm9, %zmm16, %zmm7 # zmm7 = (zmm16 * zmm7) + zmm9 | |
| vinsertps $76, %xmm4, %xmm28, %xmm9 # xmm9 = xmm4[1],xmm28[1],zero,zero | |
| vextractf128 $1, %ymm0, %xmm0 | |
| vbroadcastss %xmm0, %zmm11 | |
| vmovaps %ymm29, %ymm0 | |
| vpermt2ps %ymm20, %ymm31, %ymm0 | |
| vfmadd213ps 224(%rsp), %zmm16, %zmm11 # 64-byte Folded Reload | |
| # zmm11 = (zmm16 * zmm11) + mem | |
| vmovaps %zmm13, %zmm16 | |
| vmovaps .LCPI0_18(%rip), %zmm29 # zmm29 = [1,17,2,3,5,21,6,7,9,25,10,11,13,29,14,15] | |
| vmovaps .LCPI0_15(%rip), %zmm20 # zmm20 = [3,19,2,3,7,23,6,7,11,27,10,11,15,31,14,15] | |
| vextractf128 $1, %ymm0, %xmm0 | |
| vshufps $212, 880(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload | |
| # xmm0 = xmm0[0,1],mem[1,3] | |
| vblendps $8, %xmm3, %xmm0, %xmm0 # xmm0 = xmm0[0,1,2],xmm3[3] | |
| vunpcklps %xmm5, %xmm14, %xmm3 # xmm3 = xmm14[0],xmm5[0],xmm14[1],xmm5[1] | |
| vmovups %ymm0, 224(%rsp) # 32-byte Spill | |
| vunpcklps %xmm11, %xmm13, %xmm0 # xmm0 = xmm13[0],xmm11[0],xmm13[1],xmm11[1] | |
| vpermt2pd %zmm0, %zmm8, %zmm3 | |
| vunpcklps %xmm26, %xmm2, %xmm0 # xmm0 = xmm2[0],xmm26[0],xmm2[1],xmm26[1] | |
| vpermt2pd %zmm0, %zmm8, %zmm6 | |
| vmovaps %xmm18, %xmm0 | |
| vunpcklps %xmm28, %xmm4, %xmm8 # xmm8 = xmm4[0],xmm28[0],xmm4[1],xmm28[1] | |
| vmovapd %zmm3, %zmm6 {%k4} | |
| vmovddup .LCPI0_194(%rip), %xmm3 # xmm3 = [4,0,4,0] | |
| # xmm3 = mem[0,0] | |
| vpermt2ps %xmm21, %xmm3, %xmm0 | |
| vunpcklps %xmm27, %xmm7, %xmm3 # xmm3 = xmm7[0],xmm27[0],xmm7[1],xmm27[1] | |
| vinsertf128 $1, %xmm0, %ymm0, %ymm0 | |
| vinsertf128 $1, %xmm3, %ymm0, %ymm3 | |
| vblendps $192, %ymm0, %ymm3, %ymm0 # ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] | |
| vmovlhps %xmm1, %xmm17, %xmm3 # xmm3 = xmm17[0],xmm1[0] | |
| vshufps $36, %xmm3, %xmm8, %xmm3 # xmm3 = xmm8[0,1],xmm3[2,0] | |
| vinsertps $76, %xmm14, %xmm5, %xmm8 # xmm8 = xmm14[1],xmm5[1],zero,zero | |
| vblendps $15, %ymm3, %ymm0, %ymm0 # ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] | |
| vinsertps $76, %xmm13, %xmm11, %xmm3 # xmm3 = xmm13[1],xmm11[1],zero,zero | |
| vpermt2pd %zmm3, %zmm12, %zmm8 | |
| vinsertps $76, %xmm2, %xmm26, %xmm3 # xmm3 = xmm2[1],xmm26[1],zero,zero | |
| vpermt2pd %zmm3, %zmm12, %zmm10 | |
| vunpcklps %xmm18, %xmm21, %xmm3 # xmm3 = xmm21[0],xmm18[0],xmm21[1],xmm18[1] | |
| vinsertf128 $1, %xmm3, %ymm0, %ymm3 | |
| vmovapd %zmm8, %zmm10 {%k4} | |
| vinsertps $76, %xmm7, %xmm27, %xmm8 # xmm8 = xmm7[1],xmm27[1],zero,zero | |
| vinsertf128 $1, %xmm8, %ymm0, %ymm8 | |
| vinsertf64x4 $0, %ymm0, %zmm6, %zmm0 | |
| vunpckhps %xmm23, %xmm24, %xmm6 # xmm6 = xmm24[2],xmm23[2],xmm24[3],xmm23[3] | |
| vblendps $192, %ymm3, %ymm8, %ymm3 # ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7] | |
| vunpcklps %xmm17, %xmm1, %xmm8 # xmm8 = xmm1[0],xmm17[0],xmm1[1],xmm17[1] | |
| vmovupd %zmm0, 1008(%rsp) # 64-byte Spill | |
| vblendps $3, %xmm9, %xmm8, %xmm8 # xmm8 = xmm9[0,1],xmm8[2,3] | |
| vunpckhps %xmm28, %xmm4, %xmm9 # xmm9 = xmm4[2],xmm28[2],xmm4[3],xmm28[3] | |
| vblendps $15, %ymm8, %ymm3, %ymm0 # ymm0 = ymm8[0,1,2,3],ymm3[4,5,6,7] | |
| vunpckhps %xmm11, %xmm13, %xmm3 # xmm3 = xmm13[2],xmm11[2],xmm13[3],xmm11[3] | |
| vunpckhps %xmm5, %xmm14, %xmm8 # xmm8 = xmm14[2],xmm5[2],xmm14[3],xmm5[3] | |
| vpermt2pd %zmm3, %zmm12, %zmm8 | |
| vunpckhps %xmm26, %xmm2, %xmm3 # xmm3 = xmm2[2],xmm26[2],xmm2[3],xmm26[3] | |
| vpermt2pd %zmm3, %zmm12, %zmm6 | |
| vinsertps $179, %xmm18, %xmm21, %xmm3 # xmm3 = zero,zero,xmm21[2],xmm18[2] | |
| vinsertf128 $1, %xmm3, %ymm0, %ymm3 | |
| vmovapd %zmm8, %zmm6 {%k4} | |
| vunpckhps %xmm27, %xmm7, %xmm8 # xmm8 = xmm7[2],xmm27[2],xmm7[3],xmm27[3] | |
| vinsertf128 $1, %xmm8, %ymm0, %ymm8 | |
| vinsertf64x4 $0, %ymm0, %zmm10, %zmm0 | |
| vmovaps .LCPI0_19(%rip), %zmm10 # zmm10 = [0,1,0,16,4,5,4,20,8,9,8,24,12,13,12,28] | |
| vblendps $192, %ymm3, %ymm8, %ymm3 # ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7] | |
| vinsertps $179, %xmm17, %xmm1, %xmm8 # xmm8 = zero,zero,xmm1[2],xmm17[2] | |
| vmovupd %zmm0, 1520(%rsp) # 64-byte Spill | |
| vblendps $3, %xmm9, %xmm8, %xmm8 # xmm8 = xmm9[0,1],xmm8[2,3] | |
| vblendps $15, %ymm8, %ymm3, %ymm9 # ymm9 = ymm8[0,1,2,3],ymm3[4,5,6,7] | |
| vmovapd .LCPI0_20(%rip), %zmm3 # zmm3 = [0,8,0,8,4,12,4,13] | |
| vunpcklps %zmm5, %zmm14, %zmm8 # zmm8 = zmm14[0],zmm5[0],zmm14[1],zmm5[1],zmm14[4],zmm5[4],zmm14[5],zmm5[5],zmm14[8],zmm5[8],zmm14[9],zmm5[9],zmm14[12],zmm5[12],zmm14[13],zmm5[13] | |
| vmovups %zmm8, 2544(%rsp) # 64-byte Spill | |
| vinsertf64x4 $0, %ymm9, %zmm6, %zmm6 | |
| vmovaps %zmm2, %zmm9 | |
| vmovupd %zmm6, 32(%rsp) # 64-byte Spill | |
| vunpckhps %zmm23, %zmm24, %zmm6 # zmm6 = zmm24[2],zmm23[2],zmm24[3],zmm23[3],zmm24[6],zmm23[6],zmm24[7],zmm23[7],zmm24[10],zmm23[10],zmm24[11],zmm23[11],zmm24[14],zmm23[14],zmm24[15],zmm23[15] | |
| vpermt2ps %zmm11, %zmm10, %zmm16 | |
| vpermt2ps %zmm26, %zmm10, %zmm25 | |
| vmovaps %zmm10, %zmm12 | |
| vpermt2pd %zmm16, %zmm3, %zmm8 | |
| vshufpd $32, %zmm25, %zmm19, %zmm0 # zmm0 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[5],zmm19[6],zmm25[6] | |
| vmovapd %zmm8, %zmm0 {%k4} | |
| vmovaps %zmm21, %zmm8 | |
| vpermt2ps %zmm18, %zmm10, %zmm8 | |
| vunpcklps %zmm27, %zmm7, %zmm10 # zmm10 = zmm7[0],zmm27[0],zmm7[1],zmm27[1],zmm7[4],zmm27[4],zmm7[5],zmm27[5],zmm7[8],zmm27[8],zmm7[9],zmm27[9],zmm7[12],zmm27[12],zmm7[13],zmm27[13] | |
| vmovups %zmm10, 816(%rsp) # 64-byte Spill | |
| vshuff64x2 $170, %zmm10, %zmm10, %zmm10 # zmm10 = zmm10[4,5,4,5,4,5,4,5] | |
| vmovups %zmm8, 352(%rsp) # 64-byte Spill | |
| vshuff64x2 $170, %zmm8, %zmm8, %zmm8 # zmm8 = zmm8[4,5,4,5,4,5,4,5] | |
| vblendpd $8, %ymm8, %ymm10, %ymm8 # ymm8 = ymm10[0,1,2],ymm8[3] | |
| vmovaps %zmm1, %zmm10 | |
| vpermt2ps %zmm17, %zmm12, %zmm10 | |
| vunpcklps %zmm28, %zmm4, %zmm12 # zmm12 = zmm4[0],zmm28[0],zmm4[1],zmm28[1],zmm4[4],zmm28[4],zmm4[5],zmm28[5],zmm4[8],zmm28[8],zmm4[9],zmm28[9],zmm4[12],zmm28[12],zmm4[13],zmm28[13] | |
| vmovups %zmm12, 3120(%rsp) # 64-byte Spill | |
| vextractf32x4 $2, %zmm12, %xmm12 | |
| vmovups %zmm10, 3184(%rsp) # 64-byte Spill | |
| vextractf32x4 $2, %zmm10, %xmm10 | |
| vblendps $3, %xmm12, %xmm10, %xmm10 # xmm10 = xmm12[0,1],xmm10[2,3] | |
| vblendpd $3, %ymm10, %ymm8, %ymm8 # ymm8 = ymm10[0,1],ymm8[2,3] | |
| vunpcklps %zmm17, %zmm1, %zmm10 # zmm10 = zmm1[0],zmm17[0],zmm1[1],zmm17[1],zmm1[4],zmm17[4],zmm1[5],zmm17[5],zmm1[8],zmm17[8],zmm1[9],zmm17[9],zmm1[12],zmm17[12],zmm1[13],zmm17[13] | |
| vmovapd %ymm8, %ymm22 | |
| vmovaps %zmm4, %zmm8 | |
| vpermt2ps %zmm28, %zmm29, %zmm8 | |
| vmovups %zmm10, 3056(%rsp) # 64-byte Spill | |
| vextractf32x4 $2, %zmm10, %xmm10 | |
| vextractf32x4 $2, %zmm8, %xmm12 | |
| vmovups %zmm8, 2992(%rsp) # 64-byte Spill | |
| vblendps $3, %xmm12, %xmm10, %xmm8 # xmm8 = xmm12[0,1],xmm10[2,3] | |
| vmovaps %zmm13, %zmm12 | |
| vunpckhps %zmm5, %zmm14, %zmm10 # zmm10 = zmm14[2],zmm5[2],zmm14[3],zmm5[3],zmm14[6],zmm5[6],zmm14[7],zmm5[7],zmm14[10],zmm5[10],zmm14[11],zmm5[11],zmm14[14],zmm5[14],zmm14[15],zmm5[15] | |
| vmovups %ymm8, 1424(%rsp) # 32-byte Spill | |
| vmovaps .LCPI0_17(%rip), %zmm8 # zmm8 = [0,1,2,18,4,5,6,22,8,9,10,26,12,13,14,30] | |
| vmovups %zmm10, 2480(%rsp) # 64-byte Spill | |
| vpermt2ps %zmm11, %zmm8, %zmm12 | |
| vpermt2ps %zmm26, %zmm8, %zmm9 | |
| vpermt2pd %zmm12, %zmm3, %zmm10 | |
| vmovaps %zmm21, %zmm3 | |
| vpermt2ps %zmm18, %zmm8, %zmm3 | |
| vshufpd $32, %zmm9, %zmm6, %zmm15 # zmm15 = zmm6[0],zmm9[0],zmm6[2],zmm9[2],zmm6[4],zmm9[5],zmm6[6],zmm9[6] | |
| vmovapd %zmm10, %zmm15 {%k4} | |
| vmovups %zmm3, 2928(%rsp) # 64-byte Spill | |
| vshuff64x2 $170, %zmm3, %zmm3, %zmm10 # zmm10 = zmm3[4,5,4,5,4,5,4,5] | |
| vunpckhps %zmm27, %zmm7, %zmm3 # zmm3 = zmm7[2],zmm27[2],zmm7[3],zmm27[3],zmm7[6],zmm27[6],zmm7[7],zmm27[7],zmm7[10],zmm27[10],zmm7[11],zmm27[11],zmm7[14],zmm27[14],zmm7[15],zmm27[15] | |
| vmovups %zmm3, 2864(%rsp) # 64-byte Spill | |
| vshuff64x2 $170, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[4,5,4,5,4,5,4,5] | |
| vinsertf64x4 $0, %ymm22, %zmm0, %zmm0 | |
| vblendpd $8, %ymm10, %ymm3, %ymm3 # ymm3 = ymm3[0,1,2],ymm10[3] | |
| vmovaps %zmm1, %zmm10 | |
| vpermt2ps %zmm17, %zmm8, %zmm10 | |
| vunpckhps %zmm28, %zmm4, %zmm8 # zmm8 = zmm4[2],zmm28[2],zmm4[3],zmm28[3],zmm4[6],zmm28[6],zmm4[7],zmm28[7],zmm4[10],zmm28[10],zmm4[11],zmm28[11],zmm4[14],zmm28[14],zmm4[15],zmm28[15] | |
| vmovupd %zmm0, 944(%rsp) # 64-byte Spill | |
| vunpcklps %ymm17, %ymm1, %ymm0 # ymm0 = ymm1[0],ymm17[0],ymm1[1],ymm17[1],ymm1[4],ymm17[4],ymm1[5],ymm17[5] | |
| vmovups %zmm8, 2736(%rsp) # 64-byte Spill | |
| vextractf32x4 $2, %zmm8, %xmm8 | |
| vpermpd $170, %ymm0, %ymm0 # ymm0 = ymm0[2,2,2,2] | |
| vmovups %zmm10, 2800(%rsp) # 64-byte Spill | |
| vextractf32x4 $2, %zmm10, %xmm10 | |
| vblendps $3, %xmm8, %xmm10, %xmm8 # xmm8 = xmm8[0,1],xmm10[2,3] | |
| vblendpd $3, %ymm8, %ymm3, %ymm10 # ymm10 = ymm8[0,1],ymm3[2,3] | |
| vmovaps %zmm4, %zmm8 | |
| vpermt2ps %zmm28, %zmm20, %zmm8 | |
| vunpckhps %zmm17, %zmm1, %zmm3 # zmm3 = zmm1[2],zmm17[2],zmm1[3],zmm17[3],zmm1[6],zmm17[6],zmm1[7],zmm17[7],zmm1[10],zmm17[10],zmm1[11],zmm17[11],zmm1[14],zmm17[14],zmm1[15],zmm17[15] | |
| vmovups %zmm3, 2672(%rsp) # 64-byte Spill | |
| vextractf32x4 $2, %zmm3, %xmm3 | |
| vinsertf64x4 $0, %ymm10, %zmm15, %zmm22 | |
| vmovaps %zmm7, %zmm10 | |
| vpermt2ps %zmm27, %zmm29, %zmm10 | |
| vmovaps %zmm7, %zmm15 | |
| vpermt2ps %zmm27, %zmm20, %zmm15 | |
| vmovaps .LCPI0_26(%rip), %ymm20 # ymm20 = [0,1,0,8,4,5,4,12] | |
| vunpcklps %ymm5, %ymm14, %ymm29 # ymm29 = ymm14[0],ymm5[0],ymm14[1],ymm5[1],ymm14[4],ymm5[4],ymm14[5],ymm5[5] | |
| vmovups %zmm8, 2608(%rsp) # 64-byte Spill | |
| vextractf32x4 $2, %zmm8, %xmm8 | |
| vblendps $3, %xmm8, %xmm3, %xmm3 # xmm3 = xmm8[0,1],xmm3[2,3] | |
| vmovups %ymm3, 2448(%rsp) # 32-byte Spill | |
| vunpcklps %ymm28, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm28[0],ymm4[1],ymm28[1],ymm4[4],ymm28[4],ymm4[5],ymm28[5] | |
| vextractf128 $1, %ymm3, %xmm3 | |
| vblendps $3, %xmm3, %xmm0, %xmm0 # xmm0 = xmm3[0,1],xmm0[2,3] | |
| vunpcklps %zmm18, %zmm21, %zmm3 # zmm3 = zmm21[0],zmm18[0],zmm21[1],zmm18[1],zmm21[4],zmm18[4],zmm21[5],zmm18[5],zmm21[8],zmm18[8],zmm21[9],zmm18[9],zmm21[12],zmm18[12],zmm21[13],zmm18[13] | |
| vmovups %zmm3, 3376(%rsp) # 64-byte Spill | |
| vshuff64x2 $170, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[4,5,4,5,4,5,4,5] | |
| vshuff64x2 $170, %zmm10, %zmm10, %zmm8 # zmm8 = zmm10[4,5,4,5,4,5,4,5] | |
| vblendpd $8, %ymm3, %ymm8, %ymm3 # ymm3 = ymm8[0,1,2],ymm3[3] | |
| vshuff64x2 $170, %zmm15, %zmm15, %zmm8 # zmm8 = zmm15[4,5,4,5,4,5,4,5] | |
| vmovapd %ymm3, %ymm30 | |
| vunpckhps %zmm18, %zmm21, %zmm3 # zmm3 = zmm21[2],zmm18[2],zmm21[3],zmm18[3],zmm21[6],zmm18[6],zmm21[7],zmm18[7],zmm21[10],zmm18[10],zmm21[11],zmm18[11],zmm21[14],zmm18[14],zmm21[15],zmm18[15] | |
| vmovups %zmm3, 3312(%rsp) # 64-byte Spill | |
| vshuff64x2 $170, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[4,5,4,5,4,5,4,5] | |
| vblendpd $8, %ymm3, %ymm8, %ymm3 # ymm3 = ymm8[0,1,2],ymm3[3] | |
| vunpcklps %ymm27, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm27[0],ymm7[1],ymm27[1],ymm7[4],ymm27[4],ymm7[5],ymm27[5] | |
| vmovupd %ymm3, 2416(%rsp) # 32-byte Spill | |
| vunpcklpd %ymm21, %ymm18, %ymm3 # ymm3 = ymm18[0],ymm21[0],ymm18[2],ymm21[2] | |
| vshufps $36, %ymm3, %ymm8, %ymm3 # ymm3 = ymm8[0,1],ymm3[2,0],ymm8[4,5],ymm3[6,4] | |
| vmovups 2160(%rsp), %zmm8 # 64-byte Reload | |
| vblendps $15, %ymm0, %ymm3, %ymm0 # ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] | |
| vmovaps %zmm2, %zmm3 | |
| vinsertf64x4 $1, %ymm29, %zmm0, %zmm29 | |
| vpermt2ps %zmm26, %zmm8, %zmm3 | |
| vunpcklps %ymm23, %ymm24, %ymm8 # ymm8 = ymm24[0],ymm23[0],ymm24[1],ymm23[1],ymm24[4],ymm23[4],ymm24[5],ymm23[5] | |
| vshuff64x2 $85, %zmm8, %zmm8, %zmm8 # zmm8 = zmm8[2,3,2,3,2,3,2,3] | |
| vshufpd $32, %zmm3, %zmm8, %zmm3 # zmm3 = zmm8[0],zmm3[0],zmm8[2],zmm3[2],zmm8[4],zmm3[5],zmm8[6],zmm3[6] | |
| vmovaps %ymm13, %ymm8 | |
| vpermt2ps %ymm11, %ymm20, %ymm8 | |
| vmovaps .LCPI0_24(%rip), %ymm20 # ymm20 = [0,1,2,10,4,5,6,14] | |
| vinsertf64x4 $1, %ymm8, %zmm0, %zmm8 | |
| vshufpd $128, %zmm8, %zmm29, %zmm3 {%k4} # zmm3 {%k4} = zmm29[0],zmm8[0],zmm29[2],zmm8[2],zmm29[4],zmm8[4],zmm29[6],zmm8[7] | |
| vunpckhps %ymm27, %ymm7, %ymm8 # ymm8 = ymm7[2],ymm27[2],ymm7[3],ymm27[3],ymm7[6],ymm27[6],ymm7[7],ymm27[7] | |
| vunpckhps %ymm5, %ymm14, %ymm29 # ymm29 = ymm14[2],ymm5[2],ymm14[3],ymm5[3],ymm14[6],ymm5[6],ymm14[7],ymm5[7] | |
| vinsertf64x4 $0, %ymm0, %zmm3, %zmm0 | |
| vunpckhps %ymm28, %ymm4, %ymm3 # ymm3 = ymm4[2],ymm28[2],ymm4[3],ymm28[3],ymm4[6],ymm28[6],ymm4[7],ymm28[7] | |
| vextractf128 $1, %ymm3, %xmm3 | |
| vmovupd %zmm0, 1456(%rsp) # 64-byte Spill | |
| vunpckhps %ymm17, %ymm1, %ymm0 # ymm0 = ymm1[2],ymm17[2],ymm1[3],ymm17[3],ymm1[6],ymm17[6],ymm1[7],ymm17[7] | |
| vpermpd $170, %ymm0, %ymm0 # ymm0 = ymm0[2,2,2,2] | |
| vblendps $3, %xmm3, %xmm0, %xmm0 # xmm0 = xmm3[0,1],xmm0[2,3] | |
| vunpckhpd %ymm21, %ymm18, %ymm3 # ymm3 = ymm18[1],ymm21[1],ymm18[3],ymm21[3] | |
| vshufps $36, %ymm3, %ymm8, %ymm3 # ymm3 = ymm8[0,1],ymm3[2,0],ymm8[4,5],ymm3[6,4] | |
| vmovups 2224(%rsp), %zmm8 # 64-byte Reload | |
| vblendps $15, %ymm0, %ymm3, %ymm0 # ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] | |
| vmovaps %zmm2, %zmm3 | |
| vinsertf64x4 $1, %ymm29, %zmm0, %zmm29 | |
| vpermt2ps %zmm26, %zmm8, %zmm3 | |
| vunpckhps %ymm23, %ymm24, %ymm8 # ymm8 = ymm24[2],ymm23[2],ymm24[3],ymm23[3],ymm24[6],ymm23[6],ymm24[7],ymm23[7] | |
| vshuff64x2 $85, %zmm8, %zmm8, %zmm8 # zmm8 = zmm8[2,3,2,3,2,3,2,3] | |
| vshufpd $32, %zmm3, %zmm8, %zmm3 # zmm3 = zmm8[0],zmm3[0],zmm8[2],zmm3[2],zmm8[4],zmm3[5],zmm8[6],zmm3[6] | |
| vmovaps %ymm13, %ymm8 | |
| vpermt2ps %ymm11, %ymm20, %ymm8 | |
| vinsertf64x4 $1, %ymm8, %zmm0, %zmm8 | |
| vshufpd $128, %zmm8, %zmm29, %zmm3 {%k4} # zmm3 {%k4} = zmm29[0],zmm8[0],zmm29[2],zmm8[2],zmm29[4],zmm8[4],zmm29[6],zmm8[7] | |
| vmovsd .LCPI0_195(%rip), %xmm8 # xmm8 = [3,7,0,0] | |
| vmovaps %ymm14, %ymm29 | |
| vinsertf64x4 $0, %ymm0, %zmm3, %zmm0 | |
| vmovaps %xmm7, %xmm3 | |
| vmovupd %zmm0, 160(%rsp) # 64-byte Spill | |
| vunpckhps %xmm18, %xmm21, %xmm0 # xmm0 = xmm21[2],xmm18[2],xmm21[3],xmm18[3] | |
| vinsertf128 $1, %xmm0, %ymm0, %ymm0 | |
| vpermt2ps %xmm27, %xmm8, %xmm3 | |
| vshufps $51, %xmm4, %xmm28, %xmm8 # xmm8 = xmm28[3,0],xmm4[3,0] | |
| vinsertf128 $1, %xmm3, %ymm0, %ymm3 | |
| vblendps $192, %ymm0, %ymm3, %ymm0 # ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] | |
| vunpckhps %xmm17, %xmm1, %xmm3 # xmm3 = xmm1[2],xmm17[2],xmm1[3],xmm17[3] | |
| vshufps $226, %xmm3, %xmm8, %xmm3 # xmm3 = xmm8[2,0],xmm3[2,3] | |
| vbroadcastsd .LCPI0_30(%rip), %ymm8 # ymm8 = [5,13,5,13,5,13,5,13] | |
| vblendps $15, %ymm3, %ymm0, %ymm0 # ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] | |
| vmovaps .LCPI0_31(%rip), %ymm3 # ymm3 = [1,9,2,3,5,13,u,u] | |
| vmovaps %ymm0, %ymm20 | |
| vmovaps %ymm7, %ymm0 | |
| vpermt2ps %ymm27, %ymm3, %ymm0 | |
| vunpcklps %ymm18, %ymm21, %ymm3 # ymm3 = ymm21[0],ymm18[0],ymm21[1],ymm18[1],ymm21[4],ymm18[4],ymm21[5],ymm18[5] | |
| vblendps $192, %ymm3, %ymm0, %ymm0 # ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] | |
| vmovaps %ymm1, %ymm3 | |
| vpermt2ps %ymm17, %ymm8, %ymm3 | |
| vmovaps %ymm4, %ymm8 | |
| vpermt2ps %ymm28, %ymm31, %ymm8 | |
| vmovaps .LCPI0_27(%rip), %ymm31 # ymm31 = [1,9,2,3,5,13,6,7] | |
| vextractf128 $1, %ymm8, %xmm8 | |
| vblendps $3, %xmm8, %xmm3, %xmm3 # xmm3 = xmm8[0,1],xmm3[2,3] | |
| vmovups 2096(%rsp), %zmm8 # 64-byte Reload | |
| vblendps $15, %ymm3, %ymm0, %ymm0 # ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] | |
| vmovaps %zmm24, %zmm3 | |
| vpermt2ps %ymm5, %ymm31, %ymm29 | |
| vmovapd .LCPI0_16(%rip), %zmm31 # zmm31 = [2,10,2,10,6,15,6,14] | |
| vinsertf64x4 $1, %ymm29, %zmm0, %zmm29 | |
| vpermt2ps %zmm23, %zmm8, %zmm3 | |
| vunpcklps %ymm26, %ymm2, %ymm8 # ymm8 = ymm2[0],ymm26[0],ymm2[1],ymm26[1],ymm2[4],ymm26[4],ymm2[5],ymm26[5] | |
| vshuff64x2 $85, %zmm8, %zmm8, %zmm8 # zmm8 = zmm8[2,3,2,3,2,3,2,3] | |
| vpermt2pd %zmm25, %zmm31, %zmm19 | |
| vpermt2pd %zmm9, %zmm31, %zmm6 | |
| vmovaps %zmm24, %zmm9 | |
| vmovaps %zmm14, %zmm25 | |
| vshufpd $32, %zmm8, %zmm3, %zmm3 # zmm3 = zmm3[0],zmm8[0],zmm3[2],zmm8[2],zmm3[4],zmm8[5],zmm3[6],zmm8[6] | |
| vunpcklps %ymm11, %ymm13, %ymm8 # ymm8 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[4],ymm11[4],ymm13[5],ymm11[5] | |
| vinsertf64x4 $1, %ymm8, %zmm0, %zmm8 | |
| vshufpd $128, %zmm8, %zmm29, %zmm3 {%k4} # zmm3 {%k4} = zmm29[0],zmm8[0],zmm29[2],zmm8[2],zmm29[4],zmm8[4],zmm29[6],zmm8[7] | |
| vbroadcastsd .LCPI0_32(%rip), %zmm8 # zmm8 = [3,19,3,19,3,19,3,19,3,19,3,19,3,19,3,19] | |
| vmovups 1584(%rsp), %zmm29 # 64-byte Reload | |
| vinsertf64x4 $0, %ymm0, %zmm3, %zmm0 | |
| vmovaps %zmm24, %zmm3 | |
| vmovupd %zmm0, 1072(%rsp) # 64-byte Spill | |
| vmovupd 2544(%rsp), %zmm0 # 64-byte Reload | |
| vpermt2ps %zmm23, %zmm8, %zmm3 | |
| vshufpd $128, %zmm16, %zmm0, %zmm19 {%k4} # zmm19 {%k4} = zmm0[0],zmm16[0],zmm0[2],zmm16[2],zmm0[4],zmm16[4],zmm0[6],zmm16[7] | |
| vmovaps %zmm2, %zmm0 | |
| vpermt2ps %zmm26, %zmm8, %zmm0 | |
| vmovaps %zmm8, %zmm16 | |
| vshufpd $32, %zmm0, %zmm3, %zmm3 # zmm3 = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[5],zmm3[6],zmm0[6] | |
| vmovaps %zmm13, %zmm0 | |
| vpermt2ps %zmm11, %zmm8, %zmm0 | |
| vmovaps %zmm14, %zmm8 | |
| vpermt2ps %zmm5, %zmm16, %zmm8 | |
| vmovaps .LCPI0_10(%rip), %ymm16 # ymm16 = [3,11,2,3,7,15,6,7] | |
| vshufpd $128, %zmm0, %zmm8, %zmm3 {%k4} # zmm3 {%k4} = zmm8[0],zmm0[0],zmm8[2],zmm0[2],zmm8[4],zmm0[4],zmm8[6],zmm0[7] | |
| vmovupd 2480(%rsp), %zmm0 # 64-byte Reload | |
| vmovups 2352(%rsp), %zmm8 # 64-byte Reload | |
| vshufpd $128, %zmm12, %zmm0, %zmm6 {%k4} # zmm6 {%k4} = zmm0[0],zmm12[0],zmm0[2],zmm12[2],zmm0[4],zmm12[4],zmm0[6],zmm12[7] | |
| vmovaps %zmm24, %zmm0 | |
| vpermt2ps %zmm23, %zmm8, %zmm24 | |
| vunpckhps %ymm26, %ymm2, %ymm8 # ymm8 = ymm2[2],ymm26[2],ymm2[3],ymm26[3],ymm2[6],ymm26[6],ymm2[7],ymm26[7] | |
| vunpckhps %ymm11, %ymm13, %ymm12 # ymm12 = ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[6],ymm11[6],ymm13[7],ymm11[7] | |
| vshuff64x2 $85, %zmm8, %zmm8, %zmm8 # zmm8 = zmm8[2,3,2,3,2,3,2,3] | |
| vinsertf64x4 $1, %ymm12, %zmm0, %zmm12 | |
| vshufpd $32, %zmm8, %zmm24, %zmm8 # zmm8 = zmm24[0],zmm8[0],zmm24[2],zmm8[2],zmm24[4],zmm8[5],zmm24[6],zmm8[6] | |
| vmovaps %zmm14, %zmm24 | |
| vpermt2ps %ymm5, %ymm16, %ymm14 | |
| vmovaps .LCPI0_18(%rip), %zmm16 # zmm16 = [1,17,2,3,5,21,6,7,9,25,10,11,13,29,14,15] | |
| vinsertf64x4 $1, %ymm14, %zmm0, %zmm14 | |
| vshufpd $128, %zmm12, %zmm14, %zmm8 {%k4} # zmm8 {%k4} = zmm14[0],zmm12[0],zmm14[2],zmm12[2],zmm14[4],zmm12[4],zmm14[6],zmm12[7] | |
| vunpcklps %zmm11, %zmm13, %zmm12 # zmm12 = zmm13[0],zmm11[0],zmm13[1],zmm11[1],zmm13[4],zmm11[4],zmm13[5],zmm11[5],zmm13[8],zmm11[8],zmm13[9],zmm11[9],zmm13[12],zmm11[12],zmm13[13],zmm11[13] | |
| vunpckhps %zmm11, %zmm13, %zmm11 # zmm11 = zmm13[2],zmm11[2],zmm13[3],zmm11[3],zmm13[6],zmm11[6],zmm13[7],zmm11[7],zmm13[10],zmm11[10],zmm13[11],zmm11[11],zmm13[14],zmm11[14],zmm13[15],zmm11[15] | |
| vunpcklps %zmm26, %zmm2, %zmm13 # zmm13 = zmm2[0],zmm26[0],zmm2[1],zmm26[1],zmm2[4],zmm26[4],zmm2[5],zmm26[5],zmm2[8],zmm26[8],zmm2[9],zmm26[9],zmm2[12],zmm26[12],zmm2[13],zmm26[13] | |
| vunpckhps %zmm26, %zmm2, %zmm2 # zmm2 = zmm2[2],zmm26[2],zmm2[3],zmm26[3],zmm2[6],zmm26[6],zmm2[7],zmm26[7],zmm2[10],zmm26[10],zmm2[11],zmm26[11],zmm2[14],zmm26[14],zmm2[15],zmm26[15] | |
| vinsertf64x4 $0, %ymm20, %zmm3, %zmm26 | |
| vunpckhps %ymm18, %ymm21, %ymm3 # ymm3 = ymm21[2],ymm18[2],ymm21[3],ymm18[3],ymm21[6],ymm18[6],ymm21[7],ymm18[7] | |
| vmovapd %zmm22, %zmm20 | |
| vmovups 1648(%rsp), %zmm21 # 64-byte Reload | |
| vpermt2ps %zmm23, %zmm16, %zmm0 | |
| vpermt2ps %zmm5, %zmm16, %zmm24 | |
| vmovapd .LCPI0_20(%rip), %zmm16 # zmm16 = [0,8,0,8,4,12,4,13] | |
| vshufpd $32, %zmm13, %zmm0, %zmm14 # zmm14 = zmm0[0],zmm13[0],zmm0[2],zmm13[2],zmm0[4],zmm13[5],zmm0[6],zmm13[6] | |
| vpermt2pd %zmm13, %zmm31, %zmm0 | |
| vbroadcastsd .LCPI0_13(%rip), %ymm13 # ymm13 = [7,15,7,15,7,15,7,15] | |
| vshufpd $128, %zmm12, %zmm24, %zmm0 {%k4} # zmm0 {%k4} = zmm24[0],zmm12[0],zmm24[2],zmm12[2],zmm24[4],zmm12[4],zmm24[6],zmm12[7] | |
| vpermt2pd %zmm12, %zmm16, %zmm24 | |
| vmovaps .LCPI0_15(%rip), %zmm12 # zmm12 = [3,19,2,3,7,23,6,7,11,27,10,11,15,31,14,15] | |
| vpermt2ps %ymm17, %ymm13, %ymm1 | |
| vmovups 560(%rsp), %zmm17 # 64-byte Reload | |
| vmovapd %zmm24, %zmm14 {%k4} | |
| vmovups 1008(%rsp), %zmm24 # 64-byte Reload | |
| vpermt2ps %zmm23, %zmm12, %zmm9 | |
| vpermt2ps %zmm5, %zmm12, %zmm25 | |
| vmovapd %ymm30, %ymm5 | |
| vblendpd $3, 1424(%rsp), %ymm5, %ymm5 # 32-byte Folded Reload | |
| # ymm5 = mem[0,1],ymm5[2,3] | |
| vmovups 1712(%rsp), %zmm23 # 64-byte Reload | |
| vmovups 944(%rsp), %zmm30 # 64-byte Reload | |
| vshufpd $32, %zmm2, %zmm9, %zmm12 # zmm12 = zmm9[0],zmm2[0],zmm9[2],zmm2[2],zmm9[4],zmm2[5],zmm9[6],zmm2[6] | |
| vpermt2pd %zmm2, %zmm31, %zmm9 | |
| vmovupd 2416(%rsp), %ymm2 # 32-byte Reload | |
| vmovups 32(%rsp), %zmm31 # 64-byte Reload | |
| vblendpd $3, 2448(%rsp), %ymm2, %ymm2 # 32-byte Folded Reload | |
| # ymm2 = mem[0,1],ymm2[2,3] | |
| vshufpd $128, %zmm11, %zmm25, %zmm9 {%k4} # zmm9 {%k4} = zmm25[0],zmm11[0],zmm25[2],zmm11[2],zmm25[4],zmm11[4],zmm25[6],zmm11[7] | |
| vpermt2pd %zmm11, %zmm16, %zmm25 | |
| vmovaps .LCPI0_14(%rip), %ymm11 # ymm11 = [3,11,2,3,7,15,u,u] | |
| vmovapd %zmm25, %zmm12 {%k4} | |
| vinsertf64x4 $0, %ymm2, %zmm12, %zmm18 | |
| vextractf64x4 $1, %zmm10, %ymm2 | |
| vmovups 688(%rsp), %zmm10 # 64-byte Reload | |
| vmovups 416(%rsp), %zmm12 # 64-byte Reload | |
| vmovups 1456(%rsp), %zmm25 # 64-byte Reload | |
| vpermt2ps %ymm27, %ymm11, %ymm7 | |
| vmovups 752(%rsp), %ymm11 # 32-byte Reload | |
| vinsertf64x4 $0, %ymm5, %zmm14, %zmm27 | |
| vblendps $192, %ymm3, %ymm7, %ymm3 # ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7] | |
| vmovups 1136(%rsp), %zmm7 # 64-byte Reload | |
| vpermt2ps %ymm28, %ymm11, %ymm4 | |
| vmovups 1328(%rsp), %zmm11 # 64-byte Reload | |
| vmovups 288(%rsp), %zmm28 # 64-byte Reload | |
| vextractf128 $1, %ymm4, %xmm4 | |
| vblendps $3, %xmm4, %xmm1, %xmm1 # xmm1 = xmm4[0,1],xmm1[2,3] | |
| vmovupd 3120(%rsp), %zmm4 # 64-byte Reload | |
| vblendps $15, %ymm1, %ymm3, %ymm1 # ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] | |
| vmovupd 816(%rsp), %zmm3 # 64-byte Reload | |
| vinsertf64x4 $0, %ymm1, %zmm8, %zmm13 | |
| vmovupd 352(%rsp), %zmm1 # 64-byte Reload | |
| vmovups 96(%rsp), %zmm8 # 64-byte Reload | |
| vextractf32x4 $3, %zmm4, %xmm4 | |
| vextractf64x4 $1, %zmm3, %ymm3 | |
| vextractf64x4 $1, %zmm1, %ymm1 | |
| vblendpd $8, %ymm1, %ymm3, %ymm1 # ymm1 = ymm3[0,1,2],ymm1[3] | |
| vmovupd 3184(%rsp), %zmm3 # 64-byte Reload | |
| vextractf32x4 $3, %zmm3, %xmm3 | |
| vblendpd $1, %xmm4, %xmm3, %xmm3 # xmm3 = xmm4[0],xmm3[1] | |
| vmovupd 2736(%rsp), %zmm4 # 64-byte Reload | |
| vblendpd $3, %ymm3, %ymm1, %ymm1 # ymm1 = ymm3[0,1],ymm1[2,3] | |
| vmovupd 2992(%rsp), %zmm3 # 64-byte Reload | |
| vinsertf64x4 $0, %ymm1, %zmm19, %zmm14 | |
| vmovupd 3376(%rsp), %zmm1 # 64-byte Reload | |
| vextractf32x4 $3, %zmm4, %xmm4 | |
| vextractf32x4 $3, %zmm3, %xmm3 | |
| vextractf64x4 $1, %zmm1, %ymm1 | |
| vblendpd $8, %ymm1, %ymm2, %ymm1 # ymm1 = ymm2[0,1,2],ymm1[3] | |
| vmovupd 3056(%rsp), %zmm2 # 64-byte Reload | |
| vextractf32x4 $3, %zmm2, %xmm2 | |
| vblendpd $1, %xmm3, %xmm2, %xmm2 # xmm2 = xmm3[0],xmm2[1] | |
| vmovupd 2864(%rsp), %zmm3 # 64-byte Reload | |
| vblendpd $3, %ymm2, %ymm1, %ymm1 # ymm1 = ymm2[0,1],ymm1[2,3] | |
| vmovupd 2928(%rsp), %zmm2 # 64-byte Reload | |
| vinsertf64x4 $0, %ymm1, %zmm0, %zmm19 | |
| vmovaps .LCPI0_120(%rip), %ymm0 # ymm0 = [0,1,2,3,13,u,u,u] | |
| vmovups 224(%rsp), %ymm1 # 32-byte Reload | |
| vextractf64x4 $1, %zmm3, %ymm3 | |
| vextractf64x4 $1, %zmm2, %ymm2 | |
| vblendpd $8, %ymm2, %ymm3, %ymm2 # ymm2 = ymm3[0,1,2],ymm2[3] | |
| vmovupd 2800(%rsp), %zmm3 # 64-byte Reload | |
| vextractf32x4 $3, %zmm3, %xmm3 | |
| vblendpd $1, %xmm4, %xmm3, %xmm3 # xmm3 = xmm4[0],xmm3[1] | |
| vmovupd 2608(%rsp), %zmm4 # 64-byte Reload | |
| vblendpd $3, %ymm3, %ymm2, %ymm2 # ymm2 = ymm3[0,1],ymm2[2,3] | |
| vextractf64x4 $1, %zmm15, %ymm3 | |
| vmovups 1264(%rsp), %zmm15 # 64-byte Reload | |
| vinsertf64x4 $0, %ymm2, %zmm6, %zmm22 | |
| vmovupd 3312(%rsp), %zmm2 # 64-byte Reload | |
| vmovaps .LCPI0_122(%rip), %ymm6 # ymm6 = [0,1,2,3,4,5,6,13] | |
| vextractf32x4 $3, %zmm4, %xmm4 | |
| vpermt2ps %ymm15, %ymm0, %ymm1 | |
| vextractf64x4 $1, %zmm2, %ymm2 | |
| vblendpd $8, %ymm2, %ymm3, %ymm2 # ymm2 = ymm3[0,1,2],ymm2[3] | |
| vmovupd 2672(%rsp), %zmm3 # 64-byte Reload | |
| vblendps $32, %ymm10, %ymm1, %ymm0 # ymm0 = ymm1[0,1,2,3,4],ymm10[5],ymm1[6,7] | |
| vmovaps .LCPI0_121(%rip), %ymm1 # ymm1 = [0,1,2,3,4,5,13,u] | |
| vextractf32x4 $3, %zmm3, %xmm3 | |
| vpermt2ps %ymm8, %ymm1, %ymm0 | |
| vmovups 624(%rsp), %zmm1 # 64-byte Reload | |
| vblendpd $1, %xmm4, %xmm3, %xmm3 # xmm3 = xmm4[0],xmm3[1] | |
| vmovaps .LCPI0_123(%rip), %zmm4 # zmm4 = [0,1,2,3,4,5,6,7,21,u,u,u,u,u,u,u] | |
| vblendpd $3, %ymm3, %ymm2, %ymm2 # ymm2 = ymm3[0,1],ymm2[2,3] | |
| vmovaps .LCPI0_124(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,21,u,u,u,u,u,u] | |
| vinsertf64x4 $0, %ymm2, %zmm9, %zmm16 | |
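| # NOTE (annotation): the vextractf32x4/vextractf64x4 + vblendpd + vinsertf64x4 | |
| # sequences above splice 128- and 256-bit fragments of several vectors into full | |
| # 512-bit result rows. | |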
| vmovdqu64 1200(%rsp), %zmm9 # 64-byte Reload | |
| .loc 1 222 51 # 03-matrix-multiplication-cpu.py:222:51 | |
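| # NOTE (annotation): vpcmpgtd with a broadcast ({1to16}) constant builds a per-lane | |
| # signed greater-than predicate in %k1; per the .loc, this evaluates the load-mask | |
| # expression at 03-matrix-multiplication-cpu.py:222:51 (e.g. an offs < bound test). | |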
| vpcmpgtd .LCPI0_106(%rip){1to16}, %zmm9, %k1 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vpermt2ps %ymm1, %ymm6, %ymm0 | |
| vpermt2ps %zmm21, %zmm4, %zmm0 | |
| vmovaps .LCPI0_125(%rip), %zmm4 # zmm4 = [0,1,2,3,4,5,6,7,8,9,21,u,u,u,u,u] | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vmovups (%rsi,%rax), %zmm2 {%k1} {z} | |
| .loc 1 222 51 is_stmt 0 # 03-matrix-multiplication-cpu.py:222:51 | |
| vpcmpgtd .LCPI0_7(%rip){1to16}, %zmm9, %k1 | |
| vmovups 1840(%rsp), %zmm9 # 64-byte Reload | |
| movq -48(%rsp), %rax # 8-byte Reload | |
| .loc 1 224 35 is_stmt 1 # 03-matrix-multiplication-cpu.py:224:35 | |
| vpermt2ps %zmm29, %zmm3, %zmm0 | |
| vmovaps .LCPI0_126(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,10,21,u,u,u,u] | |
| vpermt2ps %zmm23, %zmm4, %zmm0 | |
| vmovaps .LCPI0_127(%rip), %zmm4 # zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,21,u,u,u] | |
| vpermt2ps %zmm7, %zmm3, %zmm0 | |
| vmovaps .LCPI0_128(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,21,u,u] | |
| vpermt2ps %zmm11, %zmm4, %zmm0 | |
| vmovaps .LCPI0_129(%rip), %zmm4 # zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,21,u] | |
| vpermt2ps %zmm28, %zmm3, %zmm0 | |
| vmovaps .LCPI0_130(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,21] | |
| vpermt2ps %zmm12, %zmm4, %zmm0 | |
| vpermt2ps %zmm17, %zmm3, %zmm0 | |
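| # NOTE (annotation): the vpermt2ps cascade above uses index vectors of the form | |
| # [0..k-1, 21, u, ...]; each step keeps the lanes built so far and appends index 21 | |
| # (= lane 5 of the second source) from another vector — in effect gathering one | |
| # column of a 16x16 in-register transpose into %zmm0. | |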
| vshufps $255, %zmm2, %zmm2, %zmm3 # zmm3 = zmm2[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] | |
| vshuff64x2 $170, %zmm3, %zmm3, %zmm4 # zmm4 = zmm3[4,5,4,5,4,5,4,5] | |
| vshuff64x2 $255, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[6,7,6,7,6,7,6,7] | |
| vfmadd231ps %zmm4, %zmm0, %zmm18 # zmm18 = (zmm0 * zmm4) + zmm18 | |
| vshufps $170, %zmm2, %zmm2, %zmm4 # zmm4 = zmm2[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14] | |
| vfmadd231ps %zmm3, %zmm0, %zmm16 # zmm16 = (zmm0 * zmm3) + zmm16 | |
| vshuff64x2 $170, %zmm4, %zmm4, %zmm5 # zmm5 = zmm4[4,5,4,5,4,5,4,5] | |
| vshuff64x2 $255, %zmm4, %zmm4, %zmm3 # zmm3 = zmm4[6,7,6,7,6,7,6,7] | |
| vshufps $255, %xmm2, %xmm2, %xmm4 # xmm4 = xmm2[3,3,3,3] | |
| vfmadd231ps %zmm5, %zmm0, %zmm20 # zmm20 = (zmm0 * zmm5) + zmm20 | |
| vshufps $85, %zmm2, %zmm2, %zmm5 # zmm5 = zmm2[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13] | |
| vfmadd231ps %zmm3, %zmm0, %zmm22 # zmm22 = (zmm0 * zmm3) + zmm22 | |
| vshuff64x2 $170, %zmm5, %zmm5, %zmm6 # zmm6 = zmm5[4,5,4,5,4,5,4,5] | |
| vshuff64x2 $255, %zmm5, %zmm5, %zmm3 # zmm3 = zmm5[6,7,6,7,6,7,6,7] | |
| vmovups %zmm16, 944(%rsp) # 64-byte Spill | |
| vmovaps 2032(%rsp), %xmm16 # 16-byte Reload | |
| vmovaps .LCPI0_132(%rip), %ymm5 # ymm5 = [0,1,2,3,4,14,u,u] | |
| vfmadd231ps %zmm6, %zmm0, %zmm27 # zmm27 = (zmm0 * zmm6) + zmm27 | |
| vextractf32x4 $2, %zmm2, %xmm6 | |
| vfmadd231ps %zmm3, %zmm0, %zmm19 # zmm19 = (zmm0 * zmm3) + zmm19 | |
| vextractf32x4 $3, %zmm2, %xmm3 | |
| vmovups %zmm20, 816(%rsp) # 64-byte Spill | |
| vmovups 1520(%rsp), %zmm20 # 64-byte Reload | |
| vbroadcastss %xmm6, %zmm6 | |
| vbroadcastss %xmm3, %zmm3 | |
| vfmadd231ps %zmm6, %zmm0, %zmm30 # zmm30 = (zmm0 * zmm6) + zmm30 | |
| vshufps $170, %xmm2, %xmm2, %xmm6 # xmm6 = xmm2[2,2,2,2] | |
| vfmadd231ps %zmm3, %zmm0, %zmm14 # zmm14 = (zmm0 * zmm3) + zmm14 | |
| vshufps $255, %ymm2, %ymm2, %ymm3 # ymm3 = ymm2[3,3,3,3,7,7,7,7] | |
| vbroadcastsd %xmm6, %zmm6 | |
| vshuff64x2 $85, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[2,3,2,3,2,3,2,3] | |
| vfmadd231ps %zmm6, %zmm0, %zmm31 # zmm31 = (zmm0 * zmm6) + zmm31 | |
| vmovshdup %xmm2, %xmm6 # xmm6 = xmm2[1,1,3,3] | |
| vfmadd231ps %zmm3, %zmm0, %zmm13 # zmm13 = (zmm0 * zmm3) + zmm13 | |
| vshufps $85, %ymm2, %ymm2, %ymm3 # ymm3 = ymm2[1,1,1,1,5,5,5,5] | |
| vbroadcastsd %xmm6, %zmm6 | |
| vshuff64x2 $85, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[2,3,2,3,2,3,2,3] | |
| vmovups %zmm14, 352(%rsp) # 64-byte Spill | |
| vmovaps %zmm11, %zmm14 | |
| vfmadd231ps %zmm6, %zmm0, %zmm20 # zmm20 = (zmm0 * zmm6) + zmm20 | |
| vbroadcastss %xmm2, %zmm6 | |
| vmovups %zmm13, 32(%rsp) # 64-byte Spill | |
| vmovups 1776(%rsp), %zmm13 # 64-byte Reload | |
| vfmadd231ps %zmm6, %zmm0, %zmm24 # zmm24 = (zmm0 * zmm6) + zmm24 | |
| vmovups 1072(%rsp), %zmm6 # 64-byte Reload | |
| vfmadd231ps %zmm3, %zmm0, %zmm6 # zmm6 = (zmm0 * zmm3) + zmm6 | |
| vbroadcastsd %xmm4, %zmm3 | |
| vfmadd231ps %zmm3, %zmm0, %zmm26 # zmm26 = (zmm0 * zmm3) + zmm26 | |
| vshufps $170, %ymm2, %ymm2, %ymm3 # ymm3 = ymm2[2,2,2,2,6,6,6,6] | |
| vextractf128 $1, %ymm2, %xmm2 | |
| vshuff64x2 $85, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[2,3,2,3,2,3,2,3] | |
| vbroadcastss %xmm2, %zmm2 | |
| vmovups %zmm26, 224(%rsp) # 64-byte Spill | |
| vmovups 160(%rsp), %zmm26 # 64-byte Reload | |
| vfmadd231ps %zmm2, %zmm0, %zmm25 # zmm25 = (zmm0 * zmm2) + zmm25 | |
| vmovapd .LCPI0_131(%rip), %ymm2 # ymm2 = [0,1,7,u] | |
| vfmadd231ps %zmm3, %zmm0, %zmm26 # zmm26 = (zmm0 * zmm3) + zmm26 | |
| vunpckhps %ymm13, %ymm9, %ymm0 # ymm0 = ymm9[2],ymm13[2],ymm9[3],ymm13[3],ymm9[6],ymm13[6],ymm9[7],ymm13[7] | |
| vextractf128 $1, %ymm0, %xmm0 | |
| vblendps $12, 880(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload | |
| # xmm0 = xmm0[0,1],mem[2,3] | |
| vinsertps $176, %xmm16, %xmm0, %xmm0 # xmm0 = xmm0[0,1,2],xmm16[2] | |
| vpermt2pd %ymm15, %ymm2, %ymm0 | |
| vmovaps %zmm23, %zmm15 | |
| vpermt2ps %ymm10, %ymm5, %ymm0 | |
| vblendps $192, %ymm8, %ymm0, %ymm10 # ymm10 = ymm0[0,1,2,3,4,5],ymm8[6,7] | |
| vmovaps .LCPI0_133(%rip), %ymm0 # ymm0 = [0,1,2,3,4,5,6,14] | |
| vmovaps %zmm12, %zmm8 | |
| vpermt2ps %ymm1, %ymm0, %ymm10 | |
| vmovapd .LCPI0_134(%rip), %zmm0 # zmm0 = [0,1,2,3,11,u,u,u] | |
| vpermt2pd %zmm21, %zmm0, %zmm10 | |
| vmovaps .LCPI0_135(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,6,7,8,22,u,u,u,u,u,u] | |
| vmovups 816(%rsp), %zmm21 # 64-byte Reload | |
| vpermt2ps %zmm29, %zmm0, %zmm10 | |
| vmovapd .LCPI0_136(%rip), %zmm0 # zmm0 = [0,1,2,3,4,11,u,u] | |
| vpermt2pd %zmm23, %zmm0, %zmm10 | |
| vmovaps .LCPI0_137(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,6,7,8,9,10,22,u,u,u,u] | |
| vmovaps %zmm9, %zmm23 | |
| vpermt2ps %zmm7, %zmm0, %zmm10 | |
| vmovapd .LCPI0_138(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,11,u] | |
| vpermt2pd %zmm11, %zmm0, %zmm10 | |
| vmovaps .LCPI0_139(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,22,u,u] | |
| vmovups 352(%rsp), %zmm11 # 64-byte Reload | |
| vpermt2ps %zmm28, %zmm0, %zmm10 | |
| vmovapd .LCPI0_140(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,6,11] | |
| vmovaps %zmm17, %zmm28 | |
| vpermt2pd %zmm12, %zmm0, %zmm10 | |
| vmovaps .LCPI0_141(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,22] | |
| vpermt2ps %zmm17, %zmm0, %zmm10 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vmovups (%rsi,%rax), %zmm0 {%k1} {z} | |
| vmovaps %zmm27, %zmm17 | |
| vmovups 32(%rsp), %zmm27 # 64-byte Reload | |
| movq 8(%rsp), %rax # 8-byte Reload | |
| vbroadcastss %xmm0, %zmm2 | |
| vmovshdup %xmm0, %xmm1 # xmm1 = xmm0[1,1,3,3] | |
| vshufps $170, %xmm0, %xmm0, %xmm3 # xmm3 = xmm0[2,2,2,2] | |
| vextractf128 $1, %ymm0, %xmm7 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm2, %zmm10, %zmm24 # zmm24 = (zmm10 * zmm2) + zmm24 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastsd %xmm1, %zmm2 | |
| vextractf32x4 $2, %zmm0, %xmm1 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm2, %zmm10, %zmm20 # zmm20 = (zmm10 * zmm2) + zmm20 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastsd %xmm3, %zmm2 | |
| vmovups %zmm24, 1008(%rsp) # 64-byte Spill | |
| vmovaps %zmm31, %zmm24 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm2, %zmm10, %zmm24 # zmm24 = (zmm10 * zmm2) + zmm24 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastss %xmm1, %zmm2 | |
| vshufps $255, %xmm0, %xmm0, %xmm1 # xmm1 = xmm0[3,3,3,3] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm2, %zmm10, %zmm30 # zmm30 = (zmm10 * zmm2) + zmm30 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $85, %zmm0, %zmm0, %zmm2 # zmm2 = zmm0[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13] | |
| vmovups %zmm20, 1520(%rsp) # 64-byte Spill | |
| vmovups 224(%rsp), %zmm20 # 64-byte Reload | |
| vshuff64x2 $170, %zmm2, %zmm2, %zmm3 # zmm3 = zmm2[4,5,4,5,4,5,4,5] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm3, %zmm10, %zmm17 # zmm17 = (zmm10 * zmm3) + zmm17 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $170, %zmm0, %zmm0, %zmm3 # zmm3 = zmm0[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14] | |
| vshuff64x2 $170, %zmm3, %zmm3, %zmm4 # zmm4 = zmm3[4,5,4,5,4,5,4,5] | |
| vmovaps %zmm30, %zmm31 | |
| vmovaps %zmm6, %zmm30 | |
| vmovaps %zmm13, %zmm6 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm10, %zmm21 # zmm21 = (zmm10 * zmm4) + zmm21 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $255, %zmm0, %zmm0, %zmm4 # zmm4 = zmm0[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] | |
| vshuff64x2 $170, %zmm4, %zmm4, %zmm5 # zmm5 = zmm4[4,5,4,5,4,5,4,5] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm5, %zmm10, %zmm18 # zmm18 = (zmm10 * zmm5) + zmm18 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastss %xmm7, %zmm5 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vunpcklps %zmm13, %zmm9, %zmm7 # zmm7 = zmm9[0],zmm13[0],zmm9[1],zmm13[1],zmm9[4],zmm13[4],zmm9[5],zmm13[5],zmm9[8],zmm13[8],zmm9[9],zmm13[9],zmm9[12],zmm13[12],zmm9[13],zmm13[13] | |
| vfmadd231ps %zmm5, %zmm10, %zmm25 # zmm25 = (zmm10 * zmm5) + zmm25 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $170, %ymm0, %ymm0, %ymm5 # ymm5 = ymm0[2,2,2,2,6,6,6,6] | |
| vshuff64x2 $85, %zmm5, %zmm5, %zmm5 # zmm5 = zmm5[2,3,2,3,2,3,2,3] | |
| vmovaps %zmm18, %zmm29 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vunpckhps %zmm13, %zmm9, %zmm18 # zmm18 = zmm9[2],zmm13[2],zmm9[3],zmm13[3],zmm9[6],zmm13[6],zmm9[7],zmm13[7],zmm9[10],zmm13[10],zmm9[11],zmm13[11],zmm9[14],zmm13[14],zmm9[15],zmm13[15] | |
| vfmadd231ps %zmm5, %zmm10, %zmm26 # zmm26 = (zmm10 * zmm5) + zmm26 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastsd %xmm1, %zmm5 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vmovaps %zmm9, %zmm1 | |
| vmovups %zmm25, 1456(%rsp) # 64-byte Spill | |
| vmovups 1584(%rsp), %zmm25 # 64-byte Reload | |
| vfmadd231ps %zmm5, %zmm10, %zmm20 # zmm20 = (zmm10 * zmm5) + zmm20 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $85, %ymm0, %ymm0, %ymm5 # ymm5 = ymm0[1,1,1,1,5,5,5,5] | |
| vshuff64x2 $85, %zmm5, %zmm5, %zmm5 # zmm5 = zmm5[2,3,2,3,2,3,2,3] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm5, %zmm10, %zmm30 # zmm30 = (zmm10 * zmm5) + zmm30 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $255, %ymm0, %ymm0, %ymm5 # ymm5 = ymm0[3,3,3,3,7,7,7,7] | |
| vextractf32x4 $3, %zmm0, %xmm0 | |
| vshuff64x2 $85, %zmm5, %zmm5, %zmm5 # zmm5 = zmm5[2,3,2,3,2,3,2,3] | |
| vbroadcastss %xmm0, %zmm0 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm5, %zmm10, %zmm27 # zmm27 = (zmm10 * zmm5) + zmm27 | |
| vfmadd231ps %zmm0, %zmm10, %zmm11 # zmm11 = (zmm10 * zmm0) + zmm11 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshuff64x2 $255, %zmm2, %zmm2, %zmm0 # zmm0 = zmm2[6,7,6,7,6,7,6,7] | |
| vmovups 752(%rsp), %ymm2 # 32-byte Reload | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm0, %zmm10, %zmm19 # zmm19 = (zmm10 * zmm0) + zmm19 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshuff64x2 $255, %zmm3, %zmm3, %zmm0 # zmm0 = zmm3[6,7,6,7,6,7,6,7] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vmovaps .LCPI0_144(%rip), %ymm3 # ymm3 = [0,1,2,3,4,5,15,u] | |
| vfmadd231ps %zmm0, %zmm10, %zmm22 # zmm22 = (zmm10 * zmm0) + zmm22 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshuff64x2 $255, %zmm4, %zmm4, %zmm0 # zmm0 = zmm4[6,7,6,7,6,7,6,7] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd213ps 944(%rsp), %zmm0, %zmm10 # 64-byte Folded Reload | |
| # zmm10 = (zmm0 * zmm10) + mem | |
| vmovaps .LCPI0_143(%rip), %ymm4 # ymm4 = [0,1,2,3,4,15,u,u] | |
| vpermt2ps %ymm13, %ymm2, %ymm9 | |
| vmovaps %xmm16, %xmm13 | |
| vmovaps .LCPI0_142(%rip), %ymm2 # ymm2 = [0,1,2,3,15,u,u,u] | |
| vmovups 1648(%rsp), %zmm16 # 64-byte Reload | |
| vextractf128 $1, %ymm9, %xmm0 | |
| vshufps $244, 880(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload | |
| # xmm0 = xmm0[0,1],mem[3,3] | |
| vmovdqu64 1200(%rsp), %zmm9 # 64-byte Reload | |
| .loc 1 222 51 # 03-matrix-multiplication-cpu.py:222:51 | |
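| # NOTE (annotation): six predicate masks (%k1, %k2, %k5, %k6, %k7, %k3) are computed | |
| # up front from one index vector against different broadcast bounds, then consumed | |
| # by the masked loads that follow. | |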
| vpcmpgtd .LCPI0_105(%rip){1to16}, %zmm9, %k1 | |
| vpcmpgtd .LCPI0_2(%rip){1to16}, %zmm9, %k2 | |
| vpcmpgtd .LCPI0_3(%rip){1to16}, %zmm9, %k5 | |
| vpcmpgtd .LCPI0_4(%rip){1to16}, %zmm9, %k6 | |
| vpcmpgtd .LCPI0_5(%rip){1to16}, %zmm9, %k7 | |
| vpcmpgtd .LCPI0_6(%rip){1to16}, %zmm9, %k3 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vblendps $8, %xmm13, %xmm0, %xmm0 # xmm0 = xmm0[0,1,2],xmm13[3] | |
| vmovups 1264(%rsp), %zmm13 # 64-byte Reload | |
| vpermt2ps %ymm13, %ymm2, %ymm0 | |
| vpermt2ps 688(%rsp), %ymm4, %ymm0 # 32-byte Folded Reload | |
| vmovaps .LCPI0_146(%rip), %zmm2 # zmm2 = [0,1,2,3,4,5,6,7,8,23,u,u,u,u,u,u] | |
| vpermt2ps 96(%rsp), %ymm3, %ymm0 # 32-byte Folded Reload | |
| vmovaps .LCPI0_145(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,23,u,u,u,u,u,u,u] | |
| vblendps $128, 624(%rsp), %ymm0, %ymm12 # 32-byte Folded Reload | |
| # ymm12 = ymm0[0,1,2,3,4,5,6],mem[7] | |
| vmovaps .LCPI0_149(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,23,u,u,u] | |
| vpermt2ps %zmm16, %zmm3, %zmm12 | |
| vmovaps .LCPI0_147(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,23,u,u,u,u,u] | |
| vpermt2ps %zmm25, %zmm2, %zmm12 | |
| vmovaps .LCPI0_148(%rip), %zmm2 # zmm2 = [0,1,2,3,4,5,6,7,8,9,10,23,u,u,u,u] | |
| vpermt2ps %zmm15, %zmm3, %zmm12 | |
| vmovaps .LCPI0_150(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,23,u,u] | |
| vmovups 1520(%rsp), %zmm15 # 64-byte Reload | |
| vpermt2ps 1136(%rsp), %zmm2, %zmm12 # 64-byte Folded Reload | |
| vmovaps .LCPI0_152(%rip), %zmm2 # zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,23] | |
| vpermt2ps %zmm14, %zmm0, %zmm12 | |
| vpermt2ps 288(%rsp), %zmm3, %zmm12 # 64-byte Folded Reload | |
| vmovaps .LCPI0_151(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,23,u] | |
| vmovups 1008(%rsp), %zmm14 # 64-byte Reload | |
| vpermt2ps %zmm8, %zmm0, %zmm12 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vmovups (%rsi,%rax), %zmm0 {%k1} {z} | |
| .loc 1 222 51 is_stmt 0 # 03-matrix-multiplication-cpu.py:222:51 | |
| vpcmpgtd .LCPI0_103(%rip){1to16}, %zmm9, %k1 | |
| vmovaps %zmm31, %zmm8 | |
| vmovups 1136(%rsp), %zmm31 # 64-byte Reload | |
| movq -40(%rsp), %rax # 8-byte Reload | |
| .loc 1 224 35 is_stmt 1 # 03-matrix-multiplication-cpu.py:224:35 | |
| vpermt2ps %zmm28, %zmm2, %zmm12 | |
| vmovupd 1328(%rsp), %zmm28 # 64-byte Reload | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $255, %zmm0, %zmm0, %zmm2 # zmm2 = zmm0[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] | |
| kmovw %k1, 1008(%rsp) # 2-byte Spill | |
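| # NOTE (annotation): under mask-register pressure, predicates are spilled to the | |
| # stack as 16-bit words (kmovw) and reloaded later. | |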
| .loc 1 222 51 is_stmt 0 # 03-matrix-multiplication-cpu.py:222:51 | |
| vpcmpgtd .LCPI0_1(%rip){1to16}, %zmm9, %k1 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshuff64x2 $170, %zmm2, %zmm2, %zmm3 # zmm3 = zmm2[4,5,4,5,4,5,4,5] | |
| vshuff64x2 $255, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[6,7,6,7,6,7,6,7] | |
| .loc 1 224 35 is_stmt 1 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm3, %zmm12, %zmm29 # zmm29 = (zmm12 * zmm3) + zmm29 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $170, %zmm0, %zmm0, %zmm3 # zmm3 = zmm0[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm2, %zmm12, %zmm10 # zmm10 = (zmm12 * zmm2) + zmm10 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshuff64x2 $170, %zmm3, %zmm3, %zmm4 # zmm4 = zmm3[4,5,4,5,4,5,4,5] | |
| vshuff64x2 $255, %zmm3, %zmm3, %zmm2 # zmm2 = zmm3[6,7,6,7,6,7,6,7] | |
| vextractf32x4 $3, %zmm0, %xmm3 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm12, %zmm21 # zmm21 = (zmm12 * zmm4) + zmm21 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $85, %zmm0, %zmm0, %zmm4 # zmm4 = zmm0[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm2, %zmm12, %zmm22 # zmm22 = (zmm12 * zmm2) + zmm22 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshuff64x2 $255, %zmm4, %zmm4, %zmm2 # zmm2 = zmm4[6,7,6,7,6,7,6,7] | |
| vshuff64x2 $170, %zmm4, %zmm4, %zmm5 # zmm5 = zmm4[4,5,4,5,4,5,4,5] | |
| vshufps $255, %xmm0, %xmm0, %xmm4 # xmm4 = xmm0[3,3,3,3] | |
| kmovw %k1, 880(%rsp) # 2-byte Spill | |
| .loc 1 222 51 is_stmt 0 # 03-matrix-multiplication-cpu.py:222:51 | |
| vpcmpgtd .LCPI0_104(%rip){1to16}, %zmm9, %k1 | |
| .loc 1 224 35 is_stmt 1 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm2, %zmm12, %zmm19 # zmm19 = (zmm12 * zmm2) + zmm19 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastss %xmm3, %zmm2 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vmovaps .LCPI0_18(%rip), %zmm3 # zmm3 = [1,17,2,3,5,21,6,7,9,25,10,11,13,29,14,15] | |
| vfmadd231ps %zmm5, %zmm12, %zmm17 # zmm17 = (zmm12 * zmm5) + zmm17 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vextractf32x4 $2, %zmm0, %xmm5 | |
| vmovups %zmm21, 816(%rsp) # 64-byte Spill | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm2, %zmm12, %zmm11 # zmm11 = (zmm12 * zmm2) + zmm11 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $255, %ymm0, %ymm0, %ymm2 # ymm2 = ymm0[3,3,3,3,7,7,7,7] | |
| vbroadcastss %xmm5, %zmm5 | |
| vshuff64x2 $85, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[2,3,2,3,2,3,2,3] | |
| vmovaps %zmm17, %zmm21 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vmovaps %zmm7, %zmm17 | |
| vfmadd231ps %zmm5, %zmm12, %zmm8 # zmm8 = (zmm12 * zmm5) + zmm8 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $170, %xmm0, %xmm0, %xmm5 # xmm5 = xmm0[2,2,2,2] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm2, %zmm12, %zmm27 # zmm27 = (zmm12 * zmm2) + zmm27 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $85, %ymm0, %ymm0, %ymm2 # ymm2 = ymm0[1,1,1,1,5,5,5,5] | |
| vbroadcastsd %xmm5, %zmm5 | |
| vshuff64x2 $85, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[2,3,2,3,2,3,2,3] | |
| vmovups %zmm11, 352(%rsp) # 64-byte Spill | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm5, %zmm12, %zmm24 # zmm24 = (zmm12 * zmm5) + zmm24 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vmovshdup %xmm0, %xmm5 # xmm5 = xmm0[1,1,3,3] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm2, %zmm12, %zmm30 # zmm30 = (zmm12 * zmm2) + zmm30 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastsd %xmm4, %zmm2 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vpermt2ps %zmm6, %zmm3, %zmm1 | |
| vmovups 1968(%rsp), %zmm3 # 64-byte Reload | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastsd %xmm5, %zmm5 | |
| vmovups %zmm27, 32(%rsp) # 64-byte Spill | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm2, %zmm12, %zmm20 # zmm20 = (zmm12 * zmm2) + zmm20 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $170, %ymm0, %ymm0, %ymm2 # ymm2 = ymm0[2,2,2,2,6,6,6,6] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm5, %zmm12, %zmm15 # zmm15 = (zmm12 * zmm5) + zmm15 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastss %xmm0, %zmm5 | |
| vextractf128 $1, %ymm0, %xmm0 | |
| vshuff64x2 $85, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[2,3,2,3,2,3,2,3] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm5, %zmm12, %zmm14 # zmm14 = (zmm12 * zmm5) + zmm14 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastss %xmm0, %zmm0 | |
| vmovups 1904(%rsp), %zmm5 # 64-byte Reload | |
| vmovups %zmm30, 1072(%rsp) # 64-byte Spill | |
| vmovupd 1712(%rsp), %zmm30 # 64-byte Reload | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm2, %zmm12, %zmm26 # zmm26 = (zmm12 * zmm2) + zmm26 | |
| vmovaps .LCPI0_15(%rip), %zmm2 # zmm2 = [3,19,2,3,7,23,6,7,11,27,10,11,15,31,14,15] | |
| vmovaps %zmm1, %zmm11 | |
| vfmadd213ps 1456(%rsp), %zmm0, %zmm12 # 64-byte Folded Reload | |
| # zmm12 = (zmm0 * zmm12) + mem | |
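| # NOTE (annotation): the long vpermt2ps/vpermt2pd sequence below appears to stitch | |
| # the final output rows together one or two lanes at a time, using single-payload | |
| # index vectors (e.g. [8,9,25,u], [0,1,2,24]) from the constant pool. | |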
| vmovaps .LCPI0_34(%rip), %xmm0 # xmm0 = [8,9,25,u] | |
| vmovaps %zmm26, %zmm27 | |
| vmovups 288(%rsp), %zmm26 # 64-byte Reload | |
| vpermt2ps %zmm6, %zmm2, %zmm23 | |
| vmovapd .LCPI0_58(%rip), %xmm2 # xmm2 = [6,14] | |
| vmovaps %zmm18, %zmm6 | |
| vpermt2ps %zmm3, %zmm0, %zmm11 | |
| vmovapd .LCPI0_41(%rip), %xmm0 # xmm0 = [4,13] | |
| vmovaps %zmm23, %zmm9 | |
| vpermt2pd %zmm3, %zmm2, %zmm17 | |
| vmovaps .LCPI0_67(%rip), %xmm2 # xmm2 = [12,13,29,u] | |
| vpermt2pd %zmm3, %zmm0, %zmm6 | |
| vmovaps .LCPI0_49(%rip), %xmm0 # xmm0 = [8,9,27,u] | |
| vpermt2ps %zmm3, %zmm2, %zmm1 | |
| vmovapd .LCPI0_78(%rip), %xmm2 # xmm2 = [6,15] | |
| vpermt2ps %zmm3, %zmm0, %zmm9 | |
| vmovaps .LCPI0_73(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,6,7,29,u,u,u,u,u,u,u] | |
| vpermt2pd %zmm3, %zmm2, %zmm18 | |
| vmovaps .LCPI0_90(%rip), %xmm2 # xmm2 = [12,13,31,u] | |
| vpermt2ps %zmm3, %zmm2, %zmm23 | |
| vmovapd .LCPI0_153(%rip), %xmm2 # xmm2 = [4,12] | |
| vpermt2pd %zmm3, %zmm2, %zmm7 | |
| vmovaps .LCPI0_35(%rip), %xmm2 # xmm2 = [0,1,2,25] | |
| vmovups 688(%rsp), %zmm3 # 64-byte Reload | |
| vpermt2ps %zmm5, %zmm2, %zmm11 | |
| vmovaps .LCPI0_42(%rip), %xmm2 # xmm2 = [0,1,2,26] | |
| vpermt2ps %zmm5, %zmm2, %zmm6 | |
| vmovaps .LCPI0_59(%rip), %xmm2 # xmm2 = [0,1,2,28] | |
| vpermt2ps %zmm5, %zmm2, %zmm17 | |
| vmovaps .LCPI0_68(%rip), %xmm2 # xmm2 = [0,1,2,29] | |
| vpermt2ps %zmm5, %zmm2, %zmm1 | |
| vmovaps .LCPI0_79(%rip), %xmm2 # xmm2 = [0,1,2,30] | |
| vpermt2ps %zmm5, %zmm2, %zmm18 | |
| vmovaps .LCPI0_91(%rip), %xmm2 # xmm2 = [0,1,2,31] | |
| vpermt2ps %zmm5, %zmm2, %zmm23 | |
| vmovaps .LCPI0_50(%rip), %xmm2 # xmm2 = [0,1,2,27] | |
| vpermt2ps %zmm5, %zmm2, %zmm9 | |
| vmovaps .LCPI0_154(%rip), %xmm2 # xmm2 = [0,1,2,24] | |
| vpermt2ps %zmm5, %zmm2, %zmm7 | |
| vmovaps .LCPI0_36(%rip), %ymm2 # ymm2 = [0,1,2,3,25,u,u,u] | |
| vmovups 624(%rsp), %zmm5 # 64-byte Reload | |
| vpermt2ps %zmm13, %zmm2, %zmm11 | |
| vmovapd .LCPI0_43(%rip), %ymm2 # ymm2 = [0,1,13,u] | |
| vpermt2pd %zmm13, %zmm2, %zmm6 | |
| vmovapd .LCPI0_60(%rip), %ymm2 # ymm2 = [0,1,14,u] | |
| vpermt2pd %zmm13, %zmm2, %zmm17 | |
| vmovaps .LCPI0_69(%rip), %ymm2 # ymm2 = [0,1,2,3,29,u,u,u] | |
| vpermt2ps %zmm13, %zmm2, %zmm1 | |
| vmovaps .LCPI0_51(%rip), %ymm2 # ymm2 = [0,1,2,3,27,u,u,u] | |
| vpermt2ps %zmm13, %zmm2, %zmm9 | |
| vmovapd .LCPI0_80(%rip), %ymm2 # ymm2 = [0,1,15,u] | |
| vpermt2pd %zmm13, %zmm2, %zmm18 | |
| vmovaps .LCPI0_92(%rip), %ymm2 # ymm2 = [0,1,2,3,31,u,u,u] | |
| vpermt2ps %zmm13, %zmm2, %zmm23 | |
| vmovapd .LCPI0_155(%rip), %ymm2 # ymm2 = [0,1,12,u] | |
| vpermt2pd %zmm13, %zmm2, %zmm7 | |
| vmovaps .LCPI0_37(%rip), %ymm2 # ymm2 = [0,1,2,3,4,25,u,u] | |
| vmovups 96(%rsp), %zmm13 # 64-byte Reload | |
| vpermt2ps %zmm3, %zmm2, %zmm11 | |
| vmovaps .LCPI0_44(%rip), %ymm2 # ymm2 = [0,1,2,3,4,26,u,u] | |
| vpermt2ps %zmm3, %zmm2, %zmm6 | |
| vmovaps .LCPI0_52(%rip), %ymm2 # ymm2 = [0,1,2,3,4,27,u,u] | |
| vpermt2ps %zmm3, %zmm2, %zmm9 | |
| vmovaps .LCPI0_61(%rip), %ymm2 # ymm2 = [0,1,2,3,4,28,u,u] | |
| vpermt2ps %zmm3, %zmm2, %zmm17 | |
| vmovaps .LCPI0_70(%rip), %ymm2 # ymm2 = [0,1,2,3,4,29,u,u] | |
| vpermt2ps %zmm3, %zmm2, %zmm1 | |
| vmovaps .LCPI0_81(%rip), %ymm2 # ymm2 = [0,1,2,3,4,30,u,u] | |
| vpermt2ps %zmm3, %zmm2, %zmm18 | |
| vmovaps .LCPI0_93(%rip), %ymm2 # ymm2 = [0,1,2,3,4,31,u,u] | |
| vpermt2ps %zmm3, %zmm2, %zmm23 | |
| vmovaps .LCPI0_156(%rip), %ymm2 # ymm2 = [0,1,2,3,4,24,u,u] | |
| vpermt2ps %zmm3, %zmm2, %zmm7 | |
| vmovaps .LCPI0_38(%rip), %ymm2 # ymm2 = [0,1,2,3,4,5,25,u] | |
| vmovapd .LCPI0_47(%rip), %zmm3 # zmm3 = [0,1,2,3,13,u,u,u] | |
| vpermt2ps %zmm13, %zmm2, %zmm11 | |
| vmovapd .LCPI0_45(%rip), %ymm2 # ymm2 = [0,1,2,13] | |
| vpermt2pd %zmm13, %zmm2, %zmm6 | |
| vmovapd .LCPI0_62(%rip), %ymm2 # ymm2 = [0,1,2,14] | |
| vpermt2pd %zmm13, %zmm2, %zmm17 | |
| vmovaps .LCPI0_71(%rip), %ymm2 # ymm2 = [0,1,2,3,4,5,29,u] | |
| vpermt2ps %zmm13, %zmm2, %zmm1 | |
| vmovapd .LCPI0_82(%rip), %ymm2 # ymm2 = [0,1,2,15] | |
| vpermt2pd %zmm13, %zmm2, %zmm18 | |
| vmovaps .LCPI0_94(%rip), %ymm2 # ymm2 = [0,1,2,3,4,5,31,u] | |
| vpermt2ps %zmm13, %zmm2, %zmm23 | |
| vmovaps .LCPI0_53(%rip), %ymm2 # ymm2 = [0,1,2,3,4,5,27,u] | |
| vpermt2ps %zmm13, %zmm2, %zmm9 | |
| vmovapd .LCPI0_157(%rip), %ymm2 # ymm2 = [0,1,2,12] | |
| vpermt2pd %zmm13, %zmm2, %zmm7 | |
| vmovaps .LCPI0_39(%rip), %ymm2 # ymm2 = [0,1,2,3,4,5,6,25] | |
| vpermt2ps %zmm5, %zmm2, %zmm11 | |
| vmovaps .LCPI0_46(%rip), %ymm2 # ymm2 = [0,1,2,3,4,5,6,26] | |
| vpermt2ps %zmm5, %zmm2, %zmm6 | |
| vmovaps .LCPI0_63(%rip), %ymm2 # ymm2 = [0,1,2,3,4,5,6,28] | |
| vpermt2pd %zmm16, %zmm3, %zmm6 | |
| vmovaps .LCPI0_48(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,26,u,u,u,u,u,u] | |
| vpermt2ps %zmm5, %zmm2, %zmm17 | |
| vmovaps .LCPI0_72(%rip), %ymm2 # ymm2 = [0,1,2,3,4,5,6,29] | |
| vpermt2ps %zmm25, %zmm3, %zmm6 | |
| vmovaps .LCPI0_74(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,29,u,u,u,u,u,u] | |
| vpermt2ps %zmm5, %zmm2, %zmm1 | |
| vmovaps .LCPI0_54(%rip), %ymm2 # ymm2 = [0,1,2,3,4,5,6,27] | |
| vpermt2ps %zmm16, %zmm0, %zmm1 | |
| vmovapd .LCPI0_84(%rip), %zmm0 # zmm0 = [0,1,2,3,15,u,u,u] | |
| vpermt2ps %zmm5, %zmm2, %zmm9 | |
| vmovaps .LCPI0_83(%rip), %ymm2 # ymm2 = [0,1,2,3,4,5,6,30] | |
| vpermt2ps %zmm25, %zmm3, %zmm1 | |
| vmovaps .LCPI0_56(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,27,u,u,u,u,u,u] | |
| vpermt2ps %zmm5, %zmm2, %zmm18 | |
| vmovaps .LCPI0_95(%rip), %ymm2 # ymm2 = [0,1,2,3,4,5,6,31] | |
| vpermt2pd %zmm16, %zmm0, %zmm18 | |
| vmovaps .LCPI0_96(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,6,7,31,u,u,u,u,u,u,u] | |
| vpermt2ps %zmm5, %zmm2, %zmm23 | |
| vmovaps .LCPI0_158(%rip), %ymm2 # ymm2 = [0,1,2,3,4,5,6,24] | |
| vpermt2ps %zmm16, %zmm0, %zmm23 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vmovups (%rsi,%rax), %zmm0 {%k3} {z} | |
| kmovw -114(%rsp), %k3 # 2-byte Reload | |
| movq 16(%rsp), %rax # 8-byte Reload | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vpermt2ps %zmm5, %zmm2, %zmm7 | |
| vmovaps .LCPI0_40(%rip), %zmm5 # zmm5 = [0,1,2,3,4,5,6,7,25,u,u,u,u,u,u,u] | |
| vshuff64x2 $228, %zmm16, %zmm7, %zmm13 # zmm13 = zmm7[0,1,2,3],zmm16[4,5,6,7] | |
| vmovaps .LCPI0_64(%rip), %zmm7 # zmm7 = [0,1,2,3,4,5,6,7,8,28,u,u,u,u,u,u] | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastss %xmm0, %zmm2 | |
| vshufps $170, %xmm0, %xmm0, %xmm4 # xmm4 = xmm0[2,2,2,2] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vpermt2ps %zmm16, %zmm5, %zmm11 | |
| vmovaps .LCPI0_55(%rip), %zmm5 # zmm5 = [0,1,2,3,4,5,6,7,27,u,u,u,u,u,u,u] | |
| .loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20 | |
| vmovaps %zmm25, %zmm11 {%k3} | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vpermt2ps %zmm16, %zmm5, %zmm9 | |
| vshuff64x2 $244, %zmm16, %zmm17, %zmm5 # zmm5 = zmm17[0,1,2,3],zmm16[6,7,6,7] | |
| vmovaps .LCPI0_161(%rip), %zmm16 # zmm16 = [0,1,2,3,4,5,6,7,8,9,10,24,u,u,u,u] | |
| vmovups 560(%rsp), %zmm17 # 64-byte Reload | |
| vpermt2ps %zmm25, %zmm7, %zmm5 | |
| vmovaps .LCPI0_85(%rip), %zmm7 # zmm7 = [0,1,2,3,4,5,6,7,8,30,u,u,u,u,u,u] | |
| vpermt2ps %zmm25, %zmm3, %zmm9 | |
| vmovaps .LCPI0_159(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,24,u,u,u,u,u,u] | |
| vpermt2ps %zmm25, %zmm7, %zmm18 | |
| vmovaps .LCPI0_97(%rip), %zmm7 # zmm7 = [0,1,2,3,4,5,6,7,8,31,u,u,u,u,u,u] | |
| vpermt2ps %zmm25, %zmm3, %zmm13 | |
| vmovapd .LCPI0_160(%rip), %zmm3 # zmm3 = [0,1,2,3,4,12,u,u] | |
| vpermt2ps %zmm25, %zmm7, %zmm23 | |
| vmovapd .LCPI0_162(%rip), %zmm7 # zmm7 = [0,1,2,3,4,5,12,u] | |
| vmovupd 416(%rsp), %zmm25 # 64-byte Reload | |
| vpermt2pd %zmm30, %zmm3, %zmm13 | |
| vmovaps .LCPI0_165(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,24] | |
| vpermt2ps %zmm31, %zmm16, %zmm13 | |
| vmovaps .LCPI0_163(%rip), %zmm16 # zmm16 = [0,1,2,3,4,5,6,7,8,9,10,11,12,24,u,u] | |
| vpermt2pd %zmm28, %zmm7, %zmm13 | |
| vmovapd .LCPI0_164(%rip), %zmm7 # zmm7 = [0,1,2,3,4,5,6,12] | |
| vpermt2ps %zmm26, %zmm16, %zmm13 | |
| vmovaps %zmm21, %zmm16 | |
| vmovups 816(%rsp), %zmm21 # 64-byte Reload | |
| vpermt2pd %zmm25, %zmm7, %zmm13 | |
| vpermt2ps %zmm17, %zmm3, %zmm13 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vmovshdup %xmm0, %xmm3 # xmm3 = xmm0[1,1,3,3] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm2, %zmm13, %zmm14 # zmm14 = (zmm13 * zmm2) + zmm14 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastsd %xmm3, %zmm2 | |
| vextractf32x4 $2, %zmm0, %xmm3 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm2, %zmm13, %zmm15 # zmm15 = (zmm13 * zmm2) + zmm15 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastsd %xmm4, %zmm2 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm2, %zmm13, %zmm24 # zmm24 = (zmm13 * zmm2) + zmm24 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastss %xmm3, %zmm2 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm2, %zmm13, %zmm8 # zmm8 = (zmm13 * zmm2) + zmm8 | |
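| # Broadcast-FMA microkernel: each scalar of the just-loaded vector is | |
| # splatted across a zmm (vbroadcastss/vmovshdup/vshufps/vextractf32x4, plus | |
| # vshuff64x2 for the upper 128-bit lanes) and multiplied into the permuted | |
| # row, each product accumulating into its own tile accumulator (several are | |
| # spilled to the stack between uses): a rank-1 update of the output tile | |
| # per k element. | |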
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $85, %zmm0, %zmm0, %zmm2 # zmm2 = zmm0[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13] | |
| vshuff64x2 $170, %zmm2, %zmm2, %zmm3 # zmm3 = zmm2[4,5,4,5,4,5,4,5] | |
| vshuff64x2 $255, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[6,7,6,7,6,7,6,7] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm3, %zmm13, %zmm16 # zmm16 = (zmm13 * zmm3) + zmm16 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $170, %zmm0, %zmm0, %zmm3 # zmm3 = zmm0[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm2, %zmm13, %zmm19 # zmm19 = (zmm13 * zmm2) + zmm19 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshuff64x2 $170, %zmm3, %zmm3, %zmm4 # zmm4 = zmm3[4,5,4,5,4,5,4,5] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm13, %zmm21 # zmm21 = (zmm13 * zmm4) + zmm21 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $255, %zmm0, %zmm0, %zmm4 # zmm4 = zmm0[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] | |
| vshuff64x2 $170, %zmm4, %zmm4, %zmm7 # zmm7 = zmm4[4,5,4,5,4,5,4,5] | |
| vshuff64x2 $255, %zmm4, %zmm4, %zmm4 # zmm4 = zmm4[6,7,6,7,6,7,6,7] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm7, %zmm13, %zmm29 # zmm29 = (zmm13 * zmm7) + zmm29 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vextractf128 $1, %ymm0, %xmm7 | |
| vbroadcastss %xmm7, %zmm7 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm7, %zmm13, %zmm12 # zmm12 = (zmm13 * zmm7) + zmm12 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $170, %ymm0, %ymm0, %ymm7 # ymm7 = ymm0[2,2,2,2,6,6,6,6] | |
| vshuff64x2 $85, %zmm7, %zmm7, %zmm7 # zmm7 = zmm7[2,3,2,3,2,3,2,3] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm7, %zmm13, %zmm27 # zmm27 = (zmm13 * zmm7) + zmm27 | |
| vmovups %zmm27, 160(%rsp) # 64-byte Spill | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $255, %xmm0, %xmm0, %xmm27 # xmm27 = xmm0[3,3,3,3] | |
| vbroadcastsd %xmm27, %zmm7 | |
| vmovups 32(%rsp), %zmm27 # 64-byte Reload | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm7, %zmm13, %zmm20 # zmm20 = (zmm13 * zmm7) + zmm20 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $85, %ymm0, %ymm0, %ymm7 # ymm7 = ymm0[1,1,1,1,5,5,5,5] | |
| vshuff64x2 $85, %zmm7, %zmm7, %zmm7 # zmm7 = zmm7[2,3,2,3,2,3,2,3] | |
| vmovups %zmm20, 224(%rsp) # 64-byte Spill | |
| vmovups 1072(%rsp), %zmm20 # 64-byte Reload | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm7, %zmm13, %zmm20 # zmm20 = (zmm13 * zmm7) + zmm20 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $255, %ymm0, %ymm0, %ymm7 # ymm7 = ymm0[3,3,3,3,7,7,7,7] | |
| vextractf32x4 $3, %zmm0, %xmm0 | |
| vshuff64x2 $85, %zmm7, %zmm7, %zmm7 # zmm7 = zmm7[2,3,2,3,2,3,2,3] | |
| vbroadcastss %xmm0, %zmm0 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm7, %zmm13, %zmm27 # zmm27 = (zmm13 * zmm7) + zmm27 | |
| vmovapd .LCPI0_65(%rip), %zmm7 # zmm7 = [0,1,2,3,4,14,u,u] | |
| vmovups %zmm27, 32(%rsp) # 64-byte Spill | |
| vmovups 352(%rsp), %zmm27 # 64-byte Reload | |
| vpermt2pd %zmm30, %zmm7, %zmm5 | |
| vmovapd .LCPI0_86(%rip), %zmm7 # zmm7 = [0,1,2,3,4,15,u,u] | |
| vfmadd231ps %zmm0, %zmm13, %zmm27 # zmm27 = (zmm13 * zmm0) + zmm27 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshuff64x2 $255, %zmm3, %zmm3, %zmm0 # zmm0 = zmm3[6,7,6,7,6,7,6,7] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vmovaps .LCPI0_75(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,29,u,u,u,u,u] | |
| vfmadd231ps %zmm0, %zmm13, %zmm22 # zmm22 = (zmm13 * zmm0) + zmm22 | |
| vfmadd213ps %zmm10, %zmm4, %zmm13 # zmm13 = (zmm4 * zmm13) + zmm10 | |
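| # The 213 form here overwrites zmm13, the shared row operand of the | |
| # preceding 231 FMAs: on its last use the row register is recycled as the | |
| # new accumulator value (zmm4 * zmm13 + reloaded zmm10). | |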
| vmovaps .LCPI0_166(%rip), %zmm10 # zmm10 = [0,1,2,3,4,5,6,7,8,9,25,u,u,u,u,u] | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vmovups (%rsi,%rax), %zmm0 {%k1} {z} | |
| kmovw -116(%rsp), %k1 # 2-byte Reload | |
| movq -24(%rsp), %rax # 8-byte Reload | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vpermt2pd %zmm30, %zmm7, %zmm18 | |
| vmovaps .LCPI0_167(%rip), %zmm7 # zmm7 = [0,1,2,3,4,5,6,7,8,9,10,25,u,u,u,u] | |
| .loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20 | |
| vmovapd %zmm30, %zmm6 {%k1} | |
| kmovw -118(%rsp), %k1 # 2-byte Reload | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vpermt2ps %zmm30, %zmm3, %zmm1 | |
| vmovaps .LCPI0_98(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,31,u,u,u,u,u] | |
| vpermt2ps %zmm30, %zmm10, %zmm11 | |
| vmovaps .LCPI0_169(%rip), %zmm10 # zmm10 = [0,1,2,3,4,5,6,7,8,9,10,11,12,25,u,u] | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $170, %zmm0, %zmm0, %zmm2 # zmm2 = zmm0[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vpermt2ps %zmm31, %zmm7, %zmm11 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshuff64x2 $170, %zmm2, %zmm2, %zmm7 # zmm7 = zmm2[4,5,4,5,4,5,4,5] | |
| vshuff64x2 $255, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[6,7,6,7,6,7,6,7] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vpermt2ps %zmm30, %zmm3, %zmm23 | |
| vmovaps .LCPI0_57(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,27,u,u,u,u,u] | |
| vpermt2ps %zmm30, %zmm3, %zmm9 | |
| vmovaps .LCPI0_168(%rip), %zmm30 # zmm30 = [0,1,2,3,4,5,6,7,8,9,10,11,25,u,u,u] | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $255, %zmm0, %zmm0, %zmm3 # zmm3 = zmm0[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] | |
| vshuff64x2 $170, %zmm3, %zmm3, %zmm4 # zmm4 = zmm3[4,5,4,5,4,5,4,5] | |
| vshuff64x2 $255, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[6,7,6,7,6,7,6,7] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vpermt2ps %zmm28, %zmm30, %zmm11 | |
| vpermt2ps %zmm26, %zmm10, %zmm11 | |
| vmovaps .LCPI0_170(%rip), %zmm26 # zmm26 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,25,u] | |
| vmovaps .LCPI0_171(%rip), %zmm10 # zmm10 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,25] | |
| vpermt2ps %zmm25, %zmm26, %zmm11 | |
| vpermt2ps %zmm17, %zmm10, %zmm11 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vmovups (%rsi,%rax), %zmm10 {%k7} {z} | |
| movq 544(%rsp), %rax # 8-byte Reload | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm3, %zmm11, %zmm13 # zmm13 = (zmm11 * zmm3) + zmm13 | |
| vmovaps .LCPI0_66(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,10,28,u,u,u,u] | |
| vfmadd231ps %zmm4, %zmm11, %zmm29 # zmm29 = (zmm11 * zmm4) + zmm29 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $85, %zmm0, %zmm0, %zmm4 # zmm4 = zmm0[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm7, %zmm11, %zmm21 # zmm21 = (zmm11 * zmm7) + zmm21 | |
| vfmadd231ps %zmm2, %zmm11, %zmm22 # zmm22 = (zmm11 * zmm2) + zmm22 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshuff64x2 $170, %zmm4, %zmm4, %zmm7 # zmm7 = zmm4[4,5,4,5,4,5,4,5] | |
| vshuff64x2 $255, %zmm4, %zmm4, %zmm2 # zmm2 = zmm4[6,7,6,7,6,7,6,7] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm7, %zmm11, %zmm16 # zmm16 = (zmm11 * zmm7) + zmm16 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vextractf32x4 $2, %zmm0, %xmm7 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm2, %zmm11, %zmm19 # zmm19 = (zmm11 * zmm2) + zmm19 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vextractf32x4 $3, %zmm0, %xmm2 | |
| vbroadcastss %xmm10, %zmm4 | |
| vmovups %zmm29, 96(%rsp) # 64-byte Spill | |
| vbroadcastss %xmm7, %zmm7 | |
| vbroadcastss %xmm2, %zmm2 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm7, %zmm11, %zmm8 # zmm8 = (zmm11 * zmm7) + zmm8 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $170, %xmm0, %xmm0, %xmm7 # xmm7 = xmm0[2,2,2,2] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm2, %zmm11, %zmm27 # zmm27 = (zmm11 * zmm2) + zmm27 | |
| vmovaps %zmm16, %zmm29 | |
| vmovups 32(%rsp), %zmm16 # 64-byte Reload | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $255, %ymm0, %ymm0, %ymm2 # ymm2 = ymm0[3,3,3,3,7,7,7,7] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vpermt2ps %zmm31, %zmm3, %zmm5 | |
| vmovaps .LCPI0_87(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,10,30,u,u,u,u] | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastsd %xmm7, %zmm7 | |
| vshuff64x2 $85, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[2,3,2,3,2,3,2,3] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm7, %zmm11, %zmm24 # zmm24 = (zmm11 * zmm7) + zmm24 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vmovshdup %xmm0, %xmm7 # xmm7 = xmm0[1,1,3,3] | |
| vbroadcastsd %xmm7, %zmm7 | |
| vmovups %zmm27, 352(%rsp) # 64-byte Spill | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vmovaps .LCPI0_172(%rip), %zmm27 # zmm27 = [0,1,2,3,4,5,6,7,8,9,10,26,u,u,u,u] | |
| vmovups %zmm8, 944(%rsp) # 64-byte Spill | |
| vmovapd %zmm28, %zmm8 | |
| vfmadd231ps %zmm7, %zmm11, %zmm15 # zmm15 = (zmm11 * zmm7) + zmm15 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastss %xmm0, %zmm7 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm7, %zmm11, %zmm14 # zmm14 = (zmm11 * zmm7) + zmm14 | |
| vmovaps .LCPI0_76(%rip), %zmm7 # zmm7 = [0,1,2,3,4,5,6,7,8,9,10,29,u,u,u,u] | |
| vfmadd231ps %zmm2, %zmm11, %zmm16 # zmm16 = (zmm11 * zmm2) + zmm16 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $85, %ymm0, %ymm0, %ymm2 # ymm2 = ymm0[1,1,1,1,5,5,5,5] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vpermt2ps %zmm31, %zmm3, %zmm18 | |
| vmovaps .LCPI0_99(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,10,31,u,u,u,u] | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshuff64x2 $85, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[2,3,2,3,2,3,2,3] | |
| vmovaps %zmm15, %zmm26 | |
| vmovups (%rsi,%rax), %zmm15 {%k6} {z} | |
| movq -16(%rsp), %rax # 8-byte Reload | |
| vmovaps %zmm14, %zmm30 | |
| vmovups 160(%rsp), %zmm14 # 64-byte Reload | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vpermt2ps %zmm31, %zmm7, %zmm1 | |
| vmovaps %zmm31, %zmm7 | |
| vpermt2ps %zmm7, %zmm27, %zmm6 | |
| .loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20 | |
| vmovaps %zmm7, %zmm9 {%k1} | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vmovapd .LCPI0_173(%rip), %zmm7 # zmm7 = [0,1,2,3,4,5,13,u] | |
| vmovapd .LCPI0_175(%rip), %zmm27 # zmm27 = [0,1,2,3,4,5,6,13] | |
| vpermt2ps %zmm31, %zmm3, %zmm23 | |
| vmovaps .LCPI0_100(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,31,u,u,u] | |
| vmovaps %zmm20, %zmm31 | |
| vmovups 224(%rsp), %zmm20 # 64-byte Reload | |
| vfmadd231ps %zmm2, %zmm11, %zmm31 # zmm31 = (zmm11 * zmm2) + zmm31 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $255, %xmm0, %xmm0, %xmm2 # xmm2 = xmm0[3,3,3,3] | |
| vbroadcastsd %xmm2, %zmm2 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vpermt2pd %zmm8, %zmm7, %zmm6 | |
| vmovaps .LCPI0_174(%rip), %zmm7 # zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,26,u,u] | |
| vpermt2ps %zmm28, %zmm3, %zmm23 | |
| vmovups 288(%rsp), %zmm28 # 64-byte Reload | |
| vmovaps .LCPI0_101(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,31,u,u] | |
| vfmadd231ps %zmm2, %zmm11, %zmm20 # zmm20 = (zmm11 * zmm2) + zmm20 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vextractf128 $1, %ymm0, %xmm2 | |
| vshufps $170, %ymm0, %ymm0, %ymm0 # ymm0 = ymm0[2,2,2,2,6,6,6,6] | |
| vshuff64x2 $85, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[2,3,2,3,2,3,2,3] | |
| vbroadcastss %xmm2, %zmm2 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm0, %zmm11, %zmm14 # zmm14 = (zmm11 * zmm0) + zmm14 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $255, %zmm10, %zmm10, %zmm0 # zmm0 = zmm10[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd213ps %zmm12, %zmm2, %zmm11 # zmm11 = (zmm2 * zmm11) + zmm12 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $170, %xmm10, %xmm10, %xmm12 # xmm12 = xmm10[2,2,2,2] | |
| vshufps $170, %zmm10, %zmm10, %zmm2 # zmm2 = zmm10[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vpermt2ps %zmm28, %zmm7, %zmm6 | |
| vmovaps .LCPI0_102(%rip), %zmm7 # zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,31,u] | |
| vpermt2ps %zmm28, %zmm3, %zmm23 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $85, %zmm10, %zmm10, %zmm3 # zmm3 = zmm10[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vpermt2pd %zmm25, %zmm27, %zmm6 | |
| vmovaps %zmm17, %zmm27 | |
| vpermt2ps %zmm25, %zmm7, %zmm23 | |
| vmovaps .LCPI0_176(%rip), %zmm7 # zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,26] | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshuff64x2 $170, %zmm3, %zmm3, %zmm25 # zmm25 = zmm3[4,5,4,5,4,5,4,5] | |
| vshuff64x2 $255, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[6,7,6,7,6,7,6,7] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vpermt2ps %zmm17, %zmm7, %zmm6 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vmovshdup %xmm10, %xmm7 # xmm7 = xmm10[1,1,3,3] | |
| vmovups 944(%rsp), %zmm17 # 64-byte Reload | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm6, %zmm30 # zmm30 = (zmm6 * zmm4) + zmm30 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastsd %xmm7, %zmm4 | |
| vextractf32x4 $2, %zmm10, %xmm7 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm25, %zmm6, %zmm29 # zmm29 = (zmm6 * zmm25) + zmm29 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshuff64x2 $170, %zmm0, %zmm0, %zmm25 # zmm25 = zmm0[4,5,4,5,4,5,4,5] | |
| vshuff64x2 $255, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[6,7,6,7,6,7,6,7] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm3, %zmm6, %zmm19 # zmm19 = (zmm6 * zmm3) + zmm19 | |
| vmovapd .LCPI0_88(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,15,u] | |
| vfmadd231ps %zmm4, %zmm6, %zmm26 # zmm26 = (zmm6 * zmm4) + zmm26 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastsd %xmm12, %zmm4 | |
| vshuff64x2 $170, %zmm2, %zmm2, %zmm12 # zmm12 = zmm2[4,5,4,5,4,5,4,5] | |
| vshuff64x2 $255, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[6,7,6,7,6,7,6,7] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm6, %zmm24 # zmm24 = (zmm6 * zmm4) + zmm24 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastss %xmm7, %zmm4 | |
| vshufps $255, %xmm10, %xmm10, %xmm7 # xmm7 = xmm10[3,3,3,3] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm12, %zmm6, %zmm21 # zmm21 = (zmm6 * zmm12) + zmm21 | |
| vmovups 96(%rsp), %zmm12 # 64-byte Reload | |
| vfmadd231ps %zmm2, %zmm6, %zmm22 # zmm22 = (zmm6 * zmm2) + zmm22 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $85, %zmm15, %zmm15, %zmm2 # zmm2 = zmm15[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm6, %zmm17 # zmm17 = (zmm6 * zmm4) + zmm17 | |
| vmovups %zmm24, 32(%rsp) # 64-byte Spill | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vextractf32x4 $1, %ymm10, %xmm24 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vpermt2pd %zmm8, %zmm3, %zmm18 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $255, %zmm15, %zmm15, %zmm3 # zmm3 = zmm15[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] | |
| vbroadcastss %xmm24, %zmm4 | |
| vmovups 416(%rsp), %zmm24 # 64-byte Reload | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm6, %zmm11 # zmm11 = (zmm6 * zmm4) + zmm11 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $170, %ymm10, %ymm10, %ymm4 # ymm4 = ymm10[2,2,2,2,6,6,6,6] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm25, %zmm6, %zmm12 # zmm12 = (zmm6 * zmm25) + zmm12 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshuff64x2 $85, %zmm4, %zmm4, %zmm4 # zmm4 = zmm4[2,3,2,3,2,3,2,3] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm6, %zmm14 # zmm14 = (zmm6 * zmm4) + zmm14 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastsd %xmm7, %zmm4 | |
| vmovups (%rsi,%rax), %zmm7 {%k5} {z} | |
| movw $-32768, %ax # imm = 0x8000 | |
| kmovd %eax, %k1 | |
| movq 552(%rsp), %rax # 8-byte Reload | |
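| # 0x8000 sets only bit 15 of k1, so the masked vmovaps below merges just | |
| # lane 15 of zmm27 into zmm23, filling the one lane the earlier permute | |
| # constants left undefined ('u'). | |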
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm6, %zmm20 # zmm20 = (zmm6 * zmm4) + zmm20 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $85, %ymm10, %ymm10, %ymm4 # ymm4 = ymm10[1,1,1,1,5,5,5,5] | |
| .loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20 | |
| vmovaps %zmm27, %zmm23 {%k1} | |
| kmovw 880(%rsp), %k1 # 2-byte Reload | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshuff64x2 $85, %zmm4, %zmm4, %zmm4 # zmm4 = zmm4[2,3,2,3,2,3,2,3] | |
| vmovups %zmm14, 160(%rsp) # 64-byte Spill | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm6, %zmm31 # zmm31 = (zmm6 * zmm4) + zmm31 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $255, %ymm10, %ymm10, %ymm4 # ymm4 = ymm10[3,3,3,3,7,7,7,7] | |
| vshuff64x2 $85, %zmm4, %zmm4, %zmm4 # zmm4 = zmm4[2,3,2,3,2,3,2,3] | |
| vmovups (%rsi,%rax), %zmm14 {%k2} {z} | |
| movq -8(%rsp), %rax # 8-byte Reload | |
| vmovaps %zmm20, %zmm25 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vmovaps .LCPI0_177(%rip), %zmm20 # zmm20 = [0,1,2,3,4,5,6,7,8,9,10,11,27,u,u,u] | |
| vfmadd231ps %zmm4, %zmm6, %zmm16 # zmm16 = (zmm6 * zmm4) + zmm16 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vextractf32x4 $3, %zmm10, %xmm4 | |
| vmovups %zmm31, 1072(%rsp) # 64-byte Spill | |
| vmovups 352(%rsp), %zmm31 # 64-byte Reload | |
| vbroadcastss %xmm4, %zmm4 | |
| vmovups (%rsi,%rax), %zmm10 {%k1} {z} | |
| kmovw -120(%rsp), %k1 # 2-byte Reload | |
| movq 24(%rsp), %rax # 8-byte Reload | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vpermt2ps %zmm8, %zmm20, %zmm9 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastss %xmm15, %zmm20 | |
| .loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20 | |
| vmovapd %zmm8, %zmm5 {%k1} | |
| kmovw 1008(%rsp), %k1 # 2-byte Reload | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm6, %zmm31 # zmm31 = (zmm6 * zmm4) + zmm31 | |
| vfmadd213ps %zmm13, %zmm0, %zmm6 # zmm6 = (zmm0 * zmm6) + zmm13 | |
| vmovaps .LCPI0_77(%rip), %zmm13 # zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,29,u,u,u] | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshuff64x2 $170, %zmm3, %zmm3, %zmm4 # zmm4 = zmm3[4,5,4,5,4,5,4,5] | |
| vshufps $170, %zmm15, %zmm15, %zmm0 # zmm0 = zmm15[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14] | |
| vshuff64x2 $255, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[6,7,6,7,6,7,6,7] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vpermt2ps %zmm8, %zmm13, %zmm1 | |
| vmovaps .LCPI0_178(%rip), %zmm8 # zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,27,u,u] | |
| vmovaps %zmm30, %zmm13 | |
| vmovups 1072(%rsp), %zmm30 # 64-byte Reload | |
| vpermt2ps %zmm28, %zmm8, %zmm9 | |
| vmovaps .LCPI0_179(%rip), %zmm8 # zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,27,u] | |
| vpermt2ps %zmm24, %zmm8, %zmm9 | |
| vmovaps .LCPI0_180(%rip), %zmm8 # zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,27] | |
| vpermt2ps %zmm27, %zmm8, %zmm9 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vmovshdup %xmm15, %xmm8 # xmm8 = xmm15[1,1,3,3] | |
| vmovaps %zmm19, %zmm27 | |
| vmovaps %zmm24, %zmm19 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm9, %zmm12 # zmm12 = (zmm9 * zmm4) + zmm12 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshuff64x2 $170, %zmm2, %zmm2, %zmm4 # zmm4 = zmm2[4,5,4,5,4,5,4,5] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm20, %zmm9, %zmm13 # zmm13 = (zmm9 * zmm20) + zmm13 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vmovups (%rsi,%rax), %zmm20 {%k1} {z} | |
| kmovw -122(%rsp), %k1 # 2-byte Reload | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm3, %zmm9, %zmm6 # zmm6 = (zmm9 * zmm3) + zmm6 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $85, %zmm7, %zmm7, %zmm3 # zmm3 = zmm7[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13] | |
| .loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22 | |
| addq 504(%rsp), %rsi # 8-byte Folded Reload | |
| decl %ebx | |
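| # Loop bookkeeping for line 217 (presumably the K loop): advance the base | |
| # pointer by a stride reloaded from the stack and decrement the trip | |
| # counter; the jne at the end of the block takes the backedge. | |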
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm9, %zmm29 # zmm29 = (zmm9 * zmm4) + zmm29 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vextractf32x4 $2, %zmm15, %xmm4 | |
| vmovups %zmm12, 96(%rsp) # 64-byte Spill | |
| vshuff64x2 $170, %zmm0, %zmm0, %zmm12 # zmm12 = zmm0[4,5,4,5,4,5,4,5] | |
| vbroadcastss %xmm4, %zmm4 | |
| vshuff64x2 $255, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[6,7,6,7,6,7,6,7] | |
| .loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20 | |
| vmovaps %zmm28, %zmm1 {%k1} | |
| kmovw -124(%rsp), %k1 # 2-byte Reload | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm12, %zmm9, %zmm21 # zmm21 = (zmm9 * zmm12) + zmm21 | |
| vfmadd231ps %zmm4, %zmm9, %zmm17 # zmm17 = (zmm9 * zmm4) + zmm17 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $170, %xmm15, %xmm15, %xmm4 # xmm4 = xmm15[2,2,2,2] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm0, %zmm9, %zmm22 # zmm22 = (zmm9 * zmm0) + zmm22 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshuff64x2 $255, %zmm2, %zmm2, %zmm0 # zmm0 = zmm2[6,7,6,7,6,7,6,7] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vmovaps .LCPI0_89(%rip), %zmm2 # zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,30,u,u] | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastsd %xmm4, %zmm4 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm0, %zmm9, %zmm27 # zmm27 = (zmm9 * zmm0) + zmm27 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vextractf32x4 $3, %zmm15, %xmm0 | |
| vmovaps %zmm21, %zmm12 | |
| vmovups 32(%rsp), %zmm21 # 64-byte Reload | |
| vbroadcastss %xmm0, %zmm0 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm0, %zmm9, %zmm31 # zmm31 = (zmm9 * zmm0) + zmm31 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $255, %ymm15, %ymm15, %ymm0 # ymm0 = ymm15[3,3,3,3,7,7,7,7] | |
| vshuff64x2 $85, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[2,3,2,3,2,3,2,3] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vpermt2ps %zmm28, %zmm2, %zmm18 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $170, %ymm15, %ymm15, %ymm2 # ymm2 = ymm15[2,2,2,2,6,6,6,6] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm0, %zmm9, %zmm16 # zmm16 = (zmm9 * zmm0) + zmm16 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $85, %ymm15, %ymm15, %ymm0 # ymm0 = ymm15[1,1,1,1,5,5,5,5] | |
| vshuff64x2 $85, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[2,3,2,3,2,3,2,3] | |
| vshuff64x2 $85, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[2,3,2,3,2,3,2,3] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm9, %zmm21 # zmm21 = (zmm9 * zmm4) + zmm21 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastsd %xmm8, %zmm4 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vmovaps .LCPI0_181(%rip), %zmm8 # zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,28,u,u] | |
| vfmadd231ps %zmm0, %zmm9, %zmm30 # zmm30 = (zmm9 * zmm0) + zmm30 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $255, %xmm15, %xmm15, %xmm0 # xmm0 = xmm15[3,3,3,3] | |
| vbroadcastsd %xmm0, %zmm0 | |
| .loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20 | |
| vmovapd %zmm19, %zmm18 {%k1} | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm9, %zmm26 # zmm26 = (zmm9 * zmm4) + zmm26 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastss %xmm7, %zmm4 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm0, %zmm9, %zmm25 # zmm25 = (zmm9 * zmm0) + zmm25 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vextractf128 $1, %ymm15, %xmm0 | |
| vmovups 160(%rsp), %zmm15 # 64-byte Reload | |
| vbroadcastss %xmm0, %zmm0 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vpermt2ps %zmm28, %zmm8, %zmm5 | |
| vmovapd .LCPI0_182(%rip), %zmm8 # zmm8 = [0,1,2,3,4,5,6,14] | |
| vmovups 560(%rsp), %zmm28 # 64-byte Reload | |
| vfmadd231ps %zmm2, %zmm9, %zmm15 # zmm15 = (zmm9 * zmm2) + zmm15 | |
| vfmadd213ps %zmm11, %zmm0, %zmm9 # zmm9 = (zmm0 * zmm9) + zmm11 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vmovshdup %xmm7, %xmm11 # xmm11 = xmm7[1,1,3,3] | |
| vshufps $170, %zmm7, %zmm7, %zmm0 # zmm0 = zmm7[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14] | |
| vshufps $255, %zmm7, %zmm7, %zmm2 # zmm2 = zmm7[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vpermt2pd %zmm24, %zmm8, %zmm5 | |
| vmovaps .LCPI0_183(%rip), %zmm8 # zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,28] | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vextractf32x4 $1, %ymm7, %xmm24 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vpermt2ps %zmm28, %zmm8, %zmm5 | |
| vmovaps %zmm30, %zmm8 | |
| vfmadd231ps %zmm4, %zmm5, %zmm13 # zmm13 = (zmm5 * zmm4) + zmm13 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastsd %xmm11, %zmm4 | |
| vextractf32x4 $2, %zmm7, %xmm11 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm5, %zmm26 # zmm26 = (zmm5 * zmm4) + zmm26 | |
| vmovaps %zmm13, %zmm30 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $170, %xmm7, %xmm7, %xmm13 # xmm13 = xmm7[2,2,2,2] | |
| vbroadcastsd %xmm13, %zmm4 | |
| vmovaps %zmm21, %zmm13 | |
| vshuff64x2 $170, %zmm3, %zmm3, %zmm21 # zmm21 = zmm3[4,5,4,5,4,5,4,5] | |
| vshuff64x2 $255, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[6,7,6,7,6,7,6,7] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm5, %zmm13 # zmm13 = (zmm5 * zmm4) + zmm13 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastss %xmm11, %zmm4 | |
| vmovaps %zmm17, %zmm11 | |
| vshuff64x2 $170, %zmm0, %zmm0, %zmm17 # zmm17 = zmm0[4,5,4,5,4,5,4,5] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm21, %zmm5, %zmm29 # zmm29 = (zmm5 * zmm21) + zmm29 | |
| vmovaps %zmm12, %zmm21 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $255, %xmm7, %xmm7, %xmm12 # xmm12 = xmm7[3,3,3,3] | |
| vshuff64x2 $255, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[6,7,6,7,6,7,6,7] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm17, %zmm5, %zmm21 # zmm21 = (zmm5 * zmm17) + zmm21 | |
| vmovups 96(%rsp), %zmm17 # 64-byte Reload | |
| vfmadd231ps %zmm4, %zmm5, %zmm11 # zmm11 = (zmm5 * zmm4) + zmm11 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshuff64x2 $170, %zmm2, %zmm2, %zmm4 # zmm4 = zmm2[4,5,4,5,4,5,4,5] | |
| vshuff64x2 $255, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[6,7,6,7,6,7,6,7] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm0, %zmm5, %zmm22 # zmm22 = (zmm5 * zmm0) + zmm22 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $85, %zmm14, %zmm14, %zmm0 # zmm0 = zmm14[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm5, %zmm17 # zmm17 = (zmm5 * zmm4) + zmm17 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastss %xmm24, %zmm4 | |
| vmovaps %zmm31, %zmm24 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm5, %zmm9 # zmm9 = (zmm5 * zmm4) + zmm9 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $170, %ymm7, %ymm7, %ymm4 # ymm4 = ymm7[2,2,2,2,6,6,6,6] | |
| vshuff64x2 $85, %zmm4, %zmm4, %zmm4 # zmm4 = zmm4[2,3,2,3,2,3,2,3] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm5, %zmm15 # zmm15 = (zmm5 * zmm4) + zmm15 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastsd %xmm12, %zmm4 | |
| vmovaps %zmm27, %zmm12 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm3, %zmm5, %zmm12 # zmm12 = (zmm5 * zmm3) + zmm12 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshuff64x2 $170, %zmm0, %zmm0, %zmm27 # zmm27 = zmm0[4,5,4,5,4,5,4,5] | |
| vshuff64x2 $255, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[6,7,6,7,6,7,6,7] | |
| vshufps $255, %zmm14, %zmm14, %zmm3 # zmm3 = zmm14[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm5, %zmm25 # zmm25 = (zmm5 * zmm4) + zmm25 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $85, %ymm7, %ymm7, %ymm4 # ymm4 = ymm7[1,1,1,1,5,5,5,5] | |
| vshuff64x2 $85, %zmm4, %zmm4, %zmm4 # zmm4 = zmm4[2,3,2,3,2,3,2,3] | |
| vmovups %zmm15, 160(%rsp) # 64-byte Spill | |
| vmovaps %zmm28, %zmm15 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm5, %zmm8 # zmm8 = (zmm5 * zmm4) + zmm8 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $255, %ymm7, %ymm7, %ymm4 # ymm4 = ymm7[3,3,3,3,7,7,7,7] | |
| vextractf32x4 $3, %zmm7, %xmm7 | |
| vshuff64x2 $85, %zmm4, %zmm4, %zmm4 # zmm4 = zmm4[2,3,2,3,2,3,2,3] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm5, %zmm16 # zmm16 = (zmm5 * zmm4) + zmm16 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastss %xmm7, %zmm4 | |
| vmovaps %zmm13, %zmm7 | |
| vextractf32x4 $2, %zmm14, %xmm13 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm5, %zmm24 # zmm24 = (zmm5 * zmm4) + zmm24 | |
| vfmadd213ps %zmm6, %zmm2, %zmm5 # zmm5 = (zmm2 * zmm5) + zmm6 | |
| vmovaps .LCPI0_184(%rip), %zmm6 # zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,29,u] | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshuff64x2 $170, %zmm3, %zmm3, %zmm4 # zmm4 = zmm3[4,5,4,5,4,5,4,5] | |
| vshufps $170, %zmm14, %zmm14, %zmm2 # zmm2 = zmm14[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14] | |
| vshuff64x2 $255, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[6,7,6,7,6,7,6,7] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vpermt2ps %zmm19, %zmm6, %zmm1 | |
| vmovaps .LCPI0_185(%rip), %zmm6 # zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,29] | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $170, %xmm14, %xmm14, %xmm19 # xmm19 = xmm14[2,2,2,2] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vpermt2ps %zmm28, %zmm6, %zmm1 | |
| vmovups 160(%rsp), %zmm28 # 64-byte Reload | |
| vmovaps %zmm26, %zmm6 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshuff64x2 $170, %zmm2, %zmm2, %zmm26 # zmm26 = zmm2[4,5,4,5,4,5,4,5] | |
| vshuff64x2 $255, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[6,7,6,7,6,7,6,7] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm0, %zmm1, %zmm12 # zmm12 = (zmm1 * zmm0) + zmm12 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vextractf32x4 $3, %zmm14, %xmm0 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm1, %zmm17 # zmm17 = (zmm1 * zmm4) + zmm17 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastss %xmm13, %zmm4 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm26, %zmm1, %zmm21 # zmm21 = (zmm1 * zmm26) + zmm21 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastss %xmm14, %zmm26 | |
| vmovshdup %xmm14, %xmm13 # xmm13 = xmm14[1,1,3,3] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm2, %zmm1, %zmm22 # zmm22 = (zmm1 * zmm2) + zmm22 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $170, %ymm14, %ymm14, %ymm2 # ymm2 = ymm14[2,2,2,2,6,6,6,6] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm3, %zmm1, %zmm5 # zmm5 = (zmm1 * zmm3) + zmm5 | |
| vfmadd231ps %zmm27, %zmm1, %zmm29 # zmm29 = (zmm1 * zmm27) + zmm29 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $85, %zmm10, %zmm10, %zmm3 # zmm3 = zmm10[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13] | |
| vshuff64x2 $85, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[2,3,2,3,2,3,2,3] | |
| vbroadcastss %xmm0, %zmm0 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm1, %zmm11 # zmm11 = (zmm1 * zmm4) + zmm11 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastsd %xmm19, %zmm4 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vmovaps .LCPI0_186(%rip), %zmm19 # zmm19 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,30] | |
| vfmadd231ps %zmm26, %zmm1, %zmm30 # zmm30 = (zmm1 * zmm26) + zmm30 | |
| vfmadd231ps %zmm2, %zmm1, %zmm28 # zmm28 = (zmm1 * zmm2) + zmm28 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $255, %zmm10, %zmm10, %zmm2 # zmm2 = zmm10[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm0, %zmm1, %zmm24 # zmm24 = (zmm1 * zmm0) + zmm24 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $255, %ymm14, %ymm14, %ymm0 # ymm0 = ymm14[3,3,3,3,7,7,7,7] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm1, %zmm7 # zmm7 = (zmm1 * zmm4) + zmm7 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastsd %xmm13, %zmm4 | |
| vshufps $170, %xmm10, %xmm10, %xmm13 # xmm13 = xmm10[2,2,2,2] | |
| vshuff64x2 $85, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[2,3,2,3,2,3,2,3] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm1, %zmm6 # zmm6 = (zmm1 * zmm4) + zmm6 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastss %xmm10, %zmm4 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm0, %zmm1, %zmm16 # zmm16 = (zmm1 * zmm0) + zmm16 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $85, %ymm14, %ymm14, %ymm0 # ymm0 = ymm14[1,1,1,1,5,5,5,5] | |
| vshuff64x2 $85, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[2,3,2,3,2,3,2,3] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm0, %zmm1, %zmm8 # zmm8 = (zmm1 * zmm0) + zmm8 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $255, %xmm14, %xmm14, %xmm0 # xmm0 = xmm14[3,3,3,3] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vpermt2ps %zmm15, %zmm19, %zmm18 | |
| vmovaps %zmm12, %zmm19 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastsd %xmm0, %zmm0 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm0, %zmm1, %zmm25 # zmm25 = (zmm1 * zmm0) + zmm25 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vextractf128 $1, %ymm14, %xmm0 | |
| vshuff64x2 $170, %zmm3, %zmm3, %zmm14 # zmm14 = zmm3[4,5,4,5,4,5,4,5] | |
| vshuff64x2 $255, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[6,7,6,7,6,7,6,7] | |
| vbroadcastss %xmm0, %zmm0 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm18, %zmm30 # zmm30 = (zmm18 * zmm4) + zmm30 | |
| vfmadd231ps %zmm14, %zmm18, %zmm29 # zmm29 = (zmm18 * zmm14) + zmm29 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshuff64x2 $170, %zmm2, %zmm2, %zmm14 # zmm14 = zmm2[4,5,4,5,4,5,4,5] | |
| vshuff64x2 $255, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[6,7,6,7,6,7,6,7] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm3, %zmm18, %zmm19 # zmm19 = (zmm18 * zmm3) + zmm19 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $85, %zmm20, %zmm20, %zmm3 # zmm3 = zmm20[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd213ps %zmm9, %zmm0, %zmm1 # zmm1 = (zmm0 * zmm1) + zmm9 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vmovshdup %xmm10, %xmm9 # xmm9 = xmm10[1,1,3,3] | |
| vshufps $170, %zmm10, %zmm10, %zmm0 # zmm0 = zmm10[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm14, %zmm18, %zmm17 # zmm17 = (zmm18 * zmm14) + zmm17 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastsd %xmm9, %zmm4 | |
| vextractf32x4 $2, %zmm10, %xmm9 | |
| vshuff64x2 $170, %zmm0, %zmm0, %zmm15 # zmm15 = zmm0[4,5,4,5,4,5,4,5] | |
| vshuff64x2 $255, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[6,7,6,7,6,7,6,7] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm18, %zmm6 # zmm6 = (zmm18 * zmm4) + zmm6 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastsd %xmm13, %zmm4 | |
| vextractf128 $1, %ymm10, %xmm13 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm0, %zmm18, %zmm22 # zmm22 = (zmm18 * zmm0) + zmm22 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $170, %zmm20, %zmm20, %zmm0 # zmm0 = zmm20[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm15, %zmm18, %zmm21 # zmm21 = (zmm18 * zmm15) + zmm21 | |
| vmovaps %zmm17, %zmm15 | |
| vfmadd231ps %zmm4, %zmm18, %zmm7 # zmm7 = (zmm18 * zmm4) + zmm7 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastss %xmm9, %zmm4 | |
| vmovaps %zmm11, %zmm9 | |
| vshufps $255, %xmm10, %xmm10, %xmm11 # xmm11 = xmm10[3,3,3,3] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm18, %zmm9 # zmm9 = (zmm18 * zmm4) + zmm9 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastss %xmm13, %zmm4 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm18, %zmm1 # zmm1 = (zmm18 * zmm4) + zmm1 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $170, %ymm10, %ymm10, %ymm4 # ymm4 = ymm10[2,2,2,2,6,6,6,6] | |
| vshuff64x2 $85, %zmm4, %zmm4, %zmm4 # zmm4 = zmm4[2,3,2,3,2,3,2,3] | |
| vmovaps %zmm9, %zmm12 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm18, %zmm28 # zmm28 = (zmm18 * zmm4) + zmm28 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastsd %xmm11, %zmm4 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm18, %zmm25 # zmm25 = (zmm18 * zmm4) + zmm25 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $85, %ymm10, %ymm10, %ymm4 # ymm4 = ymm10[1,1,1,1,5,5,5,5] | |
| vshuff64x2 $85, %zmm4, %zmm4, %zmm4 # zmm4 = zmm4[2,3,2,3,2,3,2,3] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm18, %zmm8 # zmm8 = (zmm18 * zmm4) + zmm8 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $255, %ymm10, %ymm10, %ymm4 # ymm4 = ymm10[3,3,3,3,7,7,7,7] | |
| vextractf32x4 $3, %zmm10, %xmm10 | |
| vshuff64x2 $85, %zmm4, %zmm4, %zmm4 # zmm4 = zmm4[2,3,2,3,2,3,2,3] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm18, %zmm16 # zmm16 = (zmm18 * zmm4) + zmm16 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastss %xmm10, %zmm4 | |
| vshufps $255, %zmm20, %zmm20, %zmm10 # zmm10 = zmm20[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm18, %zmm24 # zmm24 = (zmm18 * zmm4) + zmm24 | |
| vfmadd213ps %zmm5, %zmm2, %zmm18 # zmm18 = (zmm2 * zmm18) + zmm5 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshuff64x2 $170, %zmm0, %zmm0, %zmm5 # zmm5 = zmm0[4,5,4,5,4,5,4,5] | |
| vshuff64x2 $170, %zmm10, %zmm10, %zmm4 # zmm4 = zmm10[4,5,4,5,4,5,4,5] | |
| vshuff64x2 $255, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[6,7,6,7,6,7,6,7] | |
| vshuff64x2 $255, %zmm10, %zmm10, %zmm2 # zmm2 = zmm10[6,7,6,7,6,7,6,7] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm23, %zmm15 # zmm15 = (zmm23 * zmm4) + zmm15 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshuff64x2 $170, %zmm3, %zmm3, %zmm4 # zmm4 = zmm3[4,5,4,5,4,5,4,5] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm0, %zmm23, %zmm22 # zmm22 = (zmm23 * zmm0) + zmm22 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshuff64x2 $255, %zmm3, %zmm3, %zmm0 # zmm0 = zmm3[6,7,6,7,6,7,6,7] | |
| vshufps $255, %xmm20, %xmm20, %xmm3 # xmm3 = xmm20[3,3,3,3] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm5, %zmm23, %zmm21 # zmm21 = (zmm23 * zmm5) + zmm21 | |
| vfmadd231ps %zmm4, %zmm23, %zmm29 # zmm29 = (zmm23 * zmm4) + zmm29 | |
| vfmadd231ps %zmm0, %zmm23, %zmm19 # zmm19 = (zmm23 * zmm0) + zmm19 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vextractf32x4 $3, %zmm20, %xmm0 | |
| vextractf32x4 $2, %zmm20, %xmm4 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm2, %zmm23, %zmm18 # zmm18 = (zmm23 * zmm2) + zmm18 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vextractf32x4 $1, %ymm20, %xmm2 | |
| vbroadcastss %xmm0, %zmm0 | |
| vbroadcastss %xmm4, %zmm4 | |
| vmovaps %zmm29, %zmm9 | |
| vmovaps %zmm24, %zmm29 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm0, %zmm23, %zmm29 # zmm29 = (zmm23 * zmm0) + zmm29 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $255, %ymm20, %ymm20, %ymm0 # ymm0 = ymm20[3,3,3,3,7,7,7,7] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm23, %zmm12 # zmm12 = (zmm23 * zmm4) + zmm12 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $170, %xmm20, %xmm20, %xmm4 # xmm4 = xmm20[2,2,2,2] | |
| vshuff64x2 $85, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[2,3,2,3,2,3,2,3] | |
| vbroadcastsd %xmm4, %zmm4 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm0, %zmm23, %zmm16 # zmm16 = (zmm23 * zmm0) + zmm16 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $85, %ymm20, %ymm20, %ymm0 # ymm0 = ymm20[1,1,1,1,5,5,5,5] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm23, %zmm7 # zmm7 = (zmm23 * zmm4) + zmm7 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vmovshdup %xmm20, %xmm4 # xmm4 = xmm20[1,1,3,3] | |
| vshuff64x2 $85, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[2,3,2,3,2,3,2,3] | |
| vbroadcastsd %xmm4, %zmm4 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm0, %zmm23, %zmm8 # zmm8 = (zmm23 * zmm0) + zmm8 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastsd %xmm3, %zmm0 | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm23, %zmm6 # zmm6 = (zmm23 * zmm4) + zmm6 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastss %xmm20, %zmm4 | |
| vmovups %zmm7, 32(%rsp) # 64-byte Spill | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm0, %zmm23, %zmm25 # zmm25 = (zmm23 * zmm0) + zmm25 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshufps $170, %ymm20, %ymm20, %ymm0 # ymm0 = ymm20[2,2,2,2,6,6,6,6] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm4, %zmm23, %zmm30 # zmm30 = (zmm23 * zmm4) + zmm30 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vshuff64x2 $85, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[2,3,2,3,2,3,2,3] | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm0, %zmm23, %zmm28 # zmm28 = (zmm23 * zmm0) + zmm28 | |
| .loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
| vbroadcastss %xmm2, %zmm0 | |
| vmovups %zmm25, 224(%rsp) # 64-byte Spill | |
| .loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
| vfmadd231ps %zmm0, %zmm23, %zmm1 # zmm1 = (zmm23 * zmm0) + zmm1 | |
| .loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22 | |
| jne .LBB0_3 | |
| jmp .LBB0_4 | |
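| # K-loop backedge: jne repeats .LBB0_3 while the counter is nonzero; the | |
| # jmp then skips the .LBB0_1 zero-init path and enters the epilogue. | |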
| .LBB0_1: | |
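| # Zero-trip-count path: if the K loop never runs, every accumulator | |
| # register (and the two stack spill slots) is zeroed here instead, | |
| # presumably matching the tl.zeros accumulator initialization. | |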
| .loc 1 0 22 is_stmt 0 # 03-matrix-multiplication-cpu.py:0:22 | |
| vpxor %xmm0, %xmm0, %xmm0 | |
| vxorpd %xmm30, %xmm30, %xmm30 | |
| vxorps %xmm6, %xmm6, %xmm6 | |
| vpxor %xmm1, %xmm1, %xmm1 | |
| vxorps %xmm8, %xmm8, %xmm8 | |
| vpxord %xmm28, %xmm28, %xmm28 | |
| vxorps %xmm16, %xmm16, %xmm16 | |
| vxorps %xmm12, %xmm12, %xmm12 | |
| vpxor %xmm9, %xmm9, %xmm9 | |
| vpxord %xmm21, %xmm21, %xmm21 | |
| vpxor %xmm15, %xmm15, %xmm15 | |
| vpxord %xmm29, %xmm29, %xmm29 | |
| vxorps %xmm19, %xmm19, %xmm19 | |
| vpxord %xmm22, %xmm22, %xmm22 | |
| vxorps %xmm18, %xmm18, %xmm18 | |
| vmovdqu64 %zmm0, 32(%rsp) # 64-byte Spill | |
| vmovdqu64 %zmm0, 224(%rsp) # 64-byte Spill | |
| .LBB0_4: # %._crit_edge | |
| .loc 1 239 21 is_stmt 1 # 03-matrix-multiplication-cpu.py:239:21 | |
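| # Epilogue, reached whether or not the loop ran (%._crit_edge): the | |
| # vpunpck*/vunpck*/vshufpd/vpermt2ps network below interleaves the tile | |
| # accumulators back into the lane order expected by the store (line 239 | |
| # per the .loc), spilling intermediate rows to the stack. | |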
| vbroadcasti32x4 .LCPI0_21(%rip), %zmm5 # zmm5 = [0,8,0,8,0,8,0,8] | |
| # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] | |
| vmovdqa64 %zmm9, %zmm4 | |
| vpunpckldq %xmm4, %xmm12, %xmm24 # xmm24 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] | |
| vmovdqa64 %zmm21, %zmm9 | |
| vmovdqa64 %zmm15, %zmm20 | |
| vpunpckldq %xmm20, %xmm9, %xmm0 # xmm0 = xmm9[0],xmm20[0],xmm9[1],xmm20[1] | |
| vmovdqa64 %zmm29, %zmm26 | |
| vpunpckldq %xmm19, %xmm26, %xmm13 # xmm13 = xmm26[0],xmm19[0],xmm26[1],xmm19[1] | |
| vinsertps $76, %xmm12, %xmm4, %xmm14 # xmm14 = xmm12[1],xmm4[1],zero,zero | |
| vinsertps $76, %xmm26, %xmm19, %xmm2 # xmm2 = xmm26[1],xmm19[1],zero,zero | |
| vmovdqa64 %zmm21, %zmm3 | |
| movb $-64, %al | |
| vmovaps %zmm12, %zmm23 | |
| vmovsd .LCPI0_195(%rip), %xmm17 # xmm17 = [3,7,0,0] | |
| vinsertps $76, %xmm30, %xmm6, %xmm11 # xmm11 = xmm30[1],xmm6[1],zero,zero | |
| vunpckhps %xmm6, %xmm30, %xmm27 # xmm27 = xmm30[2],xmm6[2],xmm30[3],xmm6[3] | |
| vinsertps $76, %xmm1, %xmm8, %xmm7 # xmm7 = xmm1[1],xmm8[1],zero,zero | |
| vmovdqa64 %zmm22, %zmm25 | |
| movq 480(%rsp), %r14 # 8-byte Reload | |
| movl -108(%rsp), %r12d # 4-byte Reload | |
| movq $-1, %r15 | |
| movq 496(%rsp), %r13 # 8-byte Reload | |
| kmovd %eax, %k1 | |
| .loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33 | |
| movl 3520(%rsp), %eax | |
| vpbroadcastd %eax, %xmm10 | |
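| # Line 236 per the .loc: scalar kernel arguments are fetched from the | |
| # stack (3520(%rsp) is an incoming stack argument, plausibly a stride of | |
| # C) and broadcast so the output-pointer offsets can be formed vector-wide. | |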
| movl -112(%rsp), %eax # 4-byte Reload | |
| .loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21 | |
| vpermt2pd %zmm0, %zmm5, %zmm24 | |
| vpunpckldq %xmm18, %xmm22, %xmm0 # xmm0 = xmm22[0],xmm18[0],xmm22[1],xmm18[1] | |
| vpermi2ps %xmm8, %xmm1, %xmm17 | |
| vpermt2pd %zmm0, %zmm5, %zmm13 | |
| vinsertps $76, %xmm9, %xmm20, %xmm0 # xmm0 = xmm9[1],xmm20[1],zero,zero | |
| vpermt2pd %zmm0, %zmm5, %zmm14 | |
| vinsertps $76, %xmm22, %xmm18, %xmm0 # xmm0 = xmm22[1],xmm18[1],zero,zero | |
| vpermt2pd %zmm0, %zmm5, %zmm2 | |
| vunpckhps %xmm20, %xmm9, %xmm0 # xmm0 = xmm9[2],xmm20[2],xmm9[3],xmm20[3] | |
| .loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33 | |
| vpbroadcastd %eax, %zmm31 | |
| vmovupd %zmm2, 96(%rsp) # 64-byte Spill | |
| .loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21 | |
| vunpckhps %xmm4, %xmm12, %xmm2 # xmm2 = xmm12[2],xmm4[2],xmm12[3],xmm4[3] | |
| vpermt2pd %zmm0, %zmm5, %zmm2 | |
| vunpckhps %xmm18, %xmm22, %xmm0 # xmm0 = xmm22[2],xmm18[2],xmm22[3],xmm18[3] | |
| vmovupd %zmm2, 160(%rsp) # 64-byte Spill | |
| vunpckhps %xmm19, %xmm26, %xmm2 # xmm2 = xmm26[2],xmm19[2],xmm26[3],xmm19[3] | |
| vpermt2pd %zmm0, %zmm5, %zmm2 | |
| vbroadcastsd .LCPI0_32(%rip), %zmm0 # zmm0 = [3,19,3,19,3,19,3,19,3,19,3,19,3,19,3,19] | |
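| # In the transpose network, alternating index constants such as | |
| # [3,19,3,19,...] pick element 3 of both permute tables (19 = 16 + 3), | |
| # replicating the pair across the result; vshufpd then selects the needed | |
| # 64-bit chunks from two such results. | |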
| vshufps $51, %xmm30, %xmm6, %xmm5 # xmm5 = xmm6[3,0],xmm30[3,0] | |
| vmovaps %xmm5, 2096(%rsp) # 16-byte Spill | |
| vunpcklps %ymm6, %ymm30, %ymm5 # ymm5 = ymm30[0],ymm6[0],ymm30[1],ymm6[1],ymm30[4],ymm6[4],ymm30[5],ymm6[5] | |
| vmovups %ymm5, 2224(%rsp) # 32-byte Spill | |
| vunpckhps %ymm6, %ymm30, %ymm5 # ymm5 = ymm30[2],ymm6[2],ymm30[3],ymm6[3],ymm30[6],ymm6[6],ymm30[7],ymm6[7] | |
| vmovups %ymm5, 1456(%rsp) # 32-byte Spill | |
| vunpcklps %zmm6, %zmm30, %zmm5 # zmm5 = zmm30[0],zmm6[0],zmm30[1],zmm6[1],zmm30[4],zmm6[4],zmm30[5],zmm6[5],zmm30[8],zmm6[8],zmm30[9],zmm6[9],zmm30[12],zmm6[12],zmm30[13],zmm6[13] | |
| vmovups %zmm5, 880(%rsp) # 64-byte Spill | |
| vunpckhps %zmm6, %zmm30, %zmm5 # zmm5 = zmm30[2],zmm6[2],zmm30[3],zmm6[3],zmm30[6],zmm6[6],zmm30[7],zmm6[7],zmm30[10],zmm6[10],zmm30[11],zmm6[11],zmm30[14],zmm6[14],zmm30[15],zmm6[15] | |
| vmovupd %zmm2, 288(%rsp) # 64-byte Spill | |
| vmovaps %zmm12, %zmm2 | |
| vmovups %zmm5, 1200(%rsp) # 64-byte Spill | |
| vbroadcastsd .LCPI0_30(%rip), %ymm5 # ymm5 = [5,13,5,13,5,13,5,13] | |
| vpermt2ps %zmm4, %zmm0, %zmm2 | |
| vpermt2ps %zmm15, %zmm0, %zmm3 | |
| vshufpd $32, %zmm3, %zmm2, %zmm3 # zmm3 = zmm2[0],zmm3[0],zmm2[2],zmm3[2],zmm2[4],zmm3[5],zmm2[6],zmm3[6] | |
| vmovdqa64 %zmm29, %zmm2 | |
| vpermt2ps %zmm19, %zmm0, %zmm2 | |
| vpermi2ps %zmm18, %zmm22, %zmm0 | |
| vpermi2ps %ymm8, %ymm1, %ymm5 | |
| vmovups %ymm5, 2160(%rsp) # 32-byte Spill | |
| vbroadcastsd .LCPI0_13(%rip), %ymm5 # ymm5 = [7,15,7,15,7,15,7,15] | |
| vshufpd $128, %zmm0, %zmm2, %zmm3 {%k1} # zmm3 {%k1} = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[7] | |
| vbroadcasti32x4 .LCPI0_25(%rip), %zmm0 # zmm0 = [4,5,4,20,4,5,4,20,4,5,4,20,4,5,4,20] | |
| # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] | |
| vunpcklps %ymm4, %ymm12, %ymm2 # ymm2 = ymm12[0],ymm4[0],ymm12[1],ymm4[1],ymm12[4],ymm4[4],ymm12[5],ymm4[5] | |
| vmovupd %zmm3, 416(%rsp) # 64-byte Spill | |
| vpermi2ps %ymm8, %ymm1, %ymm5 | |
| vpermi2ps %zmm15, %zmm21, %zmm0 | |
| vmovups %ymm5, 1520(%rsp) # 32-byte Spill | |
| vunpcklps %xmm8, %xmm1, %xmm5 # xmm5 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] | |
| vshuff64x2 $85, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[2,3,2,3,2,3,2,3] | |
| vshufpd $32, %zmm0, %zmm2, %zmm3 # zmm3 = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[5],zmm2[6],zmm0[6] | |
| vbroadcastf128 .LCPI0_196(%rip), %ymm0 # ymm0 = [0,1,4,12,0,1,4,12] | |
| # ymm0 = mem[0,1,0,1] | |
| vunpcklps %ymm19, %ymm26, %ymm2 # ymm2 = ymm26[0],ymm19[0],ymm26[1],ymm19[1],ymm26[4],ymm19[4],ymm26[5],ymm19[5] | |
| vpermi2ps %ymm18, %ymm22, %ymm0 | |
| vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 | |
| vinsertf64x4 $1, %ymm2, %zmm0, %zmm2 | |
| vshufpd $128, %zmm0, %zmm2, %zmm3 {%k1} # zmm3 {%k1} = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[7] | |
| vmovaps .LCPI0_188(%rip), %ymm0 # ymm0 = [1,9,u,u,5,13,u,u] | |
| vbroadcasti32x4 .LCPI0_28(%rip), %zmm2 # zmm2 = [5,21,6,7,5,21,6,7,5,21,6,7,5,21,6,7] | |
| # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] | |
| vmovupd %zmm3, 560(%rsp) # 64-byte Spill | |
| vunpcklps %ymm20, %ymm9, %ymm3 # ymm3 = ymm9[0],ymm20[0],ymm9[1],ymm20[1],ymm9[4],ymm20[4],ymm9[5],ymm20[5] | |
| vpermi2ps %ymm19, %ymm26, %ymm0 | |
| vpermi2ps %zmm4, %zmm12, %zmm2 | |
| vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 | |
| vshuff64x2 $85, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[2,3,2,3,2,3,2,3] | |
| vshufpd $32, %zmm3, %zmm2, %zmm3 # zmm3 = zmm2[0],zmm3[0],zmm2[2],zmm3[2],zmm2[4],zmm3[5],zmm2[6],zmm3[6] | |
| vunpcklps %ymm18, %ymm22, %ymm2 # ymm2 = ymm22[0],ymm18[0],ymm22[1],ymm18[1],ymm22[4],ymm18[4],ymm22[5],ymm18[5] | |
| vinsertf64x4 $1, %ymm2, %zmm0, %zmm2 | |
| vshufpd $128, %zmm2, %zmm0, %zmm3 {%k1} # zmm3 {%k1} = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[7] | |
| vbroadcasti32x4 .LCPI0_23(%rip), %zmm0 # zmm0 = [4,5,6,22,4,5,6,22,4,5,6,22,4,5,6,22] | |
| # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] | |
| vunpckhps %ymm4, %ymm12, %ymm2 # ymm2 = ymm12[2],ymm4[2],ymm12[3],ymm4[3],ymm12[6],ymm4[6],ymm12[7],ymm4[7] | |
| vmovupd %zmm3, 688(%rsp) # 64-byte Spill | |
| vpermi2ps %zmm15, %zmm21, %zmm0 | |
| vmovaps .LCPI0_19(%rip), %zmm15 # zmm15 = [0,1,0,16,4,5,4,20,8,9,8,24,12,13,12,28] | |
| vshuff64x2 $85, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[2,3,2,3,2,3,2,3] | |
| vshufpd $32, %zmm0, %zmm2, %zmm3 # zmm3 = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[5],zmm2[6],zmm0[6] | |
| vbroadcastf128 .LCPI0_197(%rip), %ymm0 # ymm0 = [0,1,6,14,0,1,6,14] | |
| # ymm0 = mem[0,1,0,1] | |
| vunpckhps %ymm19, %ymm26, %ymm2 # ymm2 = ymm26[2],ymm19[2],ymm26[3],ymm19[3],ymm26[6],ymm19[6],ymm26[7],ymm19[7] | |
| vpermi2ps %ymm18, %ymm22, %ymm0 | |
| vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 | |
| vinsertf64x4 $1, %ymm2, %zmm0, %zmm2 | |
| vshufpd $128, %zmm0, %zmm2, %zmm3 {%k1} # zmm3 {%k1} = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[7] | |
| vmovaps .LCPI0_191(%rip), %ymm0 # ymm0 = [3,11,u,u,7,15,u,u] | |
| vbroadcasti32x4 .LCPI0_11(%rip), %zmm2 # zmm2 = [7,23,6,7,7,23,6,7,7,23,6,7,7,23,6,7] | |
| # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] | |
| vmovupd %zmm3, 624(%rsp) # 64-byte Spill | |
| vunpckhps %ymm20, %ymm9, %ymm3 # ymm3 = ymm9[2],ymm20[2],ymm9[3],ymm20[3],ymm9[6],ymm20[6],ymm9[7],ymm20[7] | |
| vpermi2ps %ymm19, %ymm26, %ymm0 | |
| vpermi2ps %zmm4, %zmm12, %zmm2 | |
| vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 | |
| vshuff64x2 $85, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[2,3,2,3,2,3,2,3] | |
| vshufpd $32, %zmm3, %zmm2, %zmm3 # zmm3 = zmm2[0],zmm3[0],zmm2[2],zmm3[2],zmm2[4],zmm3[5],zmm2[6],zmm3[6] | |
| vunpckhps %ymm18, %ymm22, %ymm2 # ymm2 = ymm22[2],ymm18[2],ymm22[3],ymm18[3],ymm22[6],ymm18[6],ymm22[7],ymm18[7] | |
| vinsertf64x4 $1, %ymm2, %zmm0, %zmm2 | |
| vshufpd $128, %zmm2, %zmm0, %zmm3 {%k1} # zmm3 {%k1} = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[7] | |
| vmovaps .LCPI0_18(%rip), %zmm0 # zmm0 = [1,17,2,3,5,21,6,7,9,25,10,11,13,29,14,15] | |
| vmovdqa64 %zmm29, %zmm2 | |
| vmovupd %zmm3, 1072(%rsp) # 64-byte Spill | |
| vunpcklps %zmm4, %zmm12, %zmm3 # zmm3 = zmm12[0],zmm4[0],zmm12[1],zmm4[1],zmm12[4],zmm4[4],zmm12[5],zmm4[5],zmm12[8],zmm4[8],zmm12[9],zmm4[9],zmm12[12],zmm4[12],zmm12[13],zmm4[13] | |
| vpermt2ps %zmm19, %zmm0, %zmm2 | |
| vmovups %zmm2, 352(%rsp) # 64-byte Spill | |
| vmovaps %zmm0, %zmm2 | |
| vpunpckldq %zmm19, %zmm29, %zmm0 # zmm0 = zmm29[0],zmm19[0],zmm29[1],zmm19[1],zmm29[4],zmm19[4],zmm29[5],zmm19[5],zmm29[8],zmm19[8],zmm29[9],zmm19[9],zmm29[12],zmm19[12],zmm29[13],zmm19[13] | |
| vpermt2ps %zmm4, %zmm2, %zmm23 | |
| vmovdqu64 %zmm0, 1264(%rsp) # 64-byte Spill | |
| vpunpckhdq %zmm19, %zmm29, %zmm0 # zmm0 = zmm29[2],zmm19[2],zmm29[3],zmm19[3],zmm29[6],zmm19[6],zmm29[7],zmm19[7],zmm29[10],zmm19[10],zmm29[11],zmm19[11],zmm29[14],zmm19[14],zmm29[15],zmm19[15] | |
| vunpckhps %zmm4, %zmm12, %zmm29 # zmm29 = zmm12[2],zmm4[2],zmm12[3],zmm4[3],zmm12[6],zmm4[6],zmm12[7],zmm4[7],zmm12[10],zmm4[10],zmm12[11],zmm4[11],zmm12[14],zmm4[14],zmm12[15],zmm4[15] | |
| vmovdqu64 %zmm0, 1328(%rsp) # 64-byte Spill | |
| vmovaps .LCPI0_15(%rip), %zmm0 # zmm0 = [3,19,2,3,7,23,6,7,11,27,10,11,15,31,14,15] | |
| vpermt2ps %zmm4, %zmm0, %zmm12 | |
| vbroadcasti128 .LCPI0_29(%rip), %ymm4 # ymm4 = [5,13,6,7,5,13,6,7] | |
| # ymm4 = mem[0,1,0,1] | |
| vpermt2ps %zmm19, %zmm0, %zmm26 | |
| vpermi2ps %ymm6, %ymm30, %ymm4 | |
| vmovups %ymm4, 2288(%rsp) # 32-byte Spill | |
| vbroadcasti128 .LCPI0_12(%rip), %ymm4 # ymm4 = [7,15,6,7,7,15,6,7] | |
| # ymm4 = mem[0,1,0,1] | |
| vpermi2ps %ymm6, %ymm30, %ymm4 | |
| vmovups %ymm4, 1712(%rsp) # 32-byte Spill | |
| vmovaps %zmm30, %zmm4 | |
| vpermt2ps %zmm6, %zmm2, %zmm4 | |
| vpermi2ps %zmm8, %zmm1, %zmm2 | |
| vmovups %zmm2, 1904(%rsp) # 64-byte Spill | |
| vunpckhps %xmm8, %xmm1, %xmm2 # xmm2 = xmm1[2],xmm8[2],xmm1[3],xmm8[3] | |
| vmovups %zmm4, 1968(%rsp) # 64-byte Spill | |
| vunpcklps %xmm6, %xmm30, %xmm4 # xmm4 = xmm30[0],xmm6[0],xmm30[1],xmm6[1] | |
| vpermt2ps %zmm6, %zmm0, %zmm30 | |
| vmovaps %zmm9, %zmm6 | |
| vmovaps %xmm2, 2032(%rsp) # 16-byte Spill | |
| vunpcklps %ymm8, %ymm1, %ymm2 # ymm2 = ymm1[0],ymm8[0],ymm1[1],ymm8[1],ymm1[4],ymm8[4],ymm1[5],ymm8[5] | |
| vmovups %ymm2, 784(%rsp) # 32-byte Spill | |
| vunpckhps %ymm8, %ymm1, %ymm2 # ymm2 = ymm1[2],ymm8[2],ymm1[3],ymm8[3],ymm1[6],ymm8[6],ymm1[7],ymm8[7] | |
| vmovups %ymm2, 752(%rsp) # 32-byte Spill | |
| vunpcklps %zmm8, %zmm1, %zmm2 # zmm2 = zmm1[0],zmm8[0],zmm1[1],zmm8[1],zmm1[4],zmm8[4],zmm1[5],zmm8[5],zmm1[8],zmm8[8],zmm1[9],zmm8[9],zmm1[12],zmm8[12],zmm1[13],zmm8[13] | |
| vmovups %zmm2, 1840(%rsp) # 64-byte Spill | |
| vunpckhps %zmm8, %zmm1, %zmm2 # zmm2 = zmm1[2],zmm8[2],zmm1[3],zmm8[3],zmm1[6],zmm8[6],zmm1[7],zmm8[7],zmm1[10],zmm8[10],zmm1[11],zmm8[11],zmm1[14],zmm8[14],zmm1[15],zmm8[15] | |
| vpermt2ps %zmm8, %zmm0, %zmm1 | |
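| # NOTE (annotation, not compiler output): the unpck/perm/shuffle network | |
| # above appears to interleave the fp32 accumulator registers into row-major | |
| # order for the stores below; the .LCPI0_* tables are vpermi2ps/vpermt2ps | |
| # lane-index vectors. | |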
| .loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33 | |
| vpmulld %xmm10, %xmm31, %xmm0 | |
| .loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21 | |
| vunpcklps %zmm20, %zmm9, %zmm8 # zmm8 = zmm9[0],zmm20[0],zmm9[1],zmm20[1],zmm9[4],zmm20[4],zmm9[5],zmm20[5],zmm9[8],zmm20[8],zmm9[9],zmm20[9],zmm9[12],zmm20[12],zmm9[13],zmm20[13] | |
| vmovups %zmm2, 1136(%rsp) # 64-byte Spill | |
| vmovaps %zmm21, %zmm2 | |
| vpermt2ps %zmm20, %zmm15, %zmm2 | |
| vmovaps .LCPI0_17(%rip), %zmm21 # zmm21 = [0,1,2,18,4,5,6,22,8,9,10,26,12,13,14,30] | |
| vmovups %zmm30, 1008(%rsp) # 64-byte Spill | |
| .loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33 | |
| vpextrd $3, %xmm0, %eax | |
| .loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21 | |
| vunpckhps %zmm20, %zmm9, %zmm0 # zmm0 = zmm9[2],zmm20[2],zmm9[3],zmm20[3],zmm9[6],zmm20[6],zmm9[7],zmm20[7],zmm9[10],zmm20[10],zmm9[11],zmm20[11],zmm9[14],zmm20[14],zmm9[15],zmm20[15] | |
| vmovaps %zmm3, %zmm9 | |
| .loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21 | |
| cltq | |
| .loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21 | |
| vshufpd $32, %zmm2, %zmm3, %zmm30 # zmm30 = zmm3[0],zmm2[0],zmm3[2],zmm2[2],zmm3[4],zmm2[5],zmm3[6],zmm2[6] | |
| vmovapd .LCPI0_16(%rip), %zmm3 # zmm3 = [2,10,2,10,6,15,6,14] | |
| vpermt2ps %zmm20, %zmm21, %zmm6 | |
| vpermt2pd %zmm2, %zmm3, %zmm9 | |
| vshufpd $32, %zmm8, %zmm23, %zmm2 # zmm2 = zmm23[0],zmm8[0],zmm23[2],zmm8[2],zmm23[4],zmm8[5],zmm23[6],zmm8[6] | |
| vpermt2pd %zmm8, %zmm3, %zmm23 | |
| vmovupd %zmm2, 1584(%rsp) # 64-byte Spill | |
| vshufpd $32, %zmm6, %zmm29, %zmm2 # zmm2 = zmm29[0],zmm6[0],zmm29[2],zmm6[2],zmm29[4],zmm6[5],zmm29[6],zmm6[6] | |
| vpermt2pd %zmm6, %zmm3, %zmm29 | |
| vmovupd %zmm2, 1648(%rsp) # 64-byte Spill | |
| vshufpd $32, %zmm0, %zmm12, %zmm2 # zmm2 = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[5],zmm12[6],zmm0[6] | |
| vpermt2pd %zmm0, %zmm3, %zmm12 | |
| vmovdqu64 3248(%rsp), %zmm3 # 64-byte Reload | |
| vmovupd %zmm2, 1776(%rsp) # 64-byte Spill | |
| vmovapd %zmm24, %zmm2 | |
| vmovups 224(%rsp), %zmm24 # 64-byte Reload | |
| vmovapd %zmm13, %zmm2 {%k1} | |
| vmovupd %zmm9, 2352(%rsp) # 64-byte Spill | |
| vmovupd %zmm29, 816(%rsp) # 64-byte Spill | |
| vmovupd %zmm12, 944(%rsp) # 64-byte Spill | |
| vmovups 32(%rsp), %zmm12 # 64-byte Reload | |
| .loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33 | |
| vpshufd $85, %xmm3, %xmm0 # xmm0 = xmm3[1,1,1,1] | |
| vextracti128 $1, %ymm3, %xmm6 | |
| vpshufd $85, %zmm3, %zmm20 # zmm20 = zmm3[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13] | |
| vpbroadcastq %xmm0, %zmm9 | |
| vpbroadcastd %xmm6, %zmm22 | |
| vpmulld %xmm10, %xmm6, %xmm6 | |
| vpmulld %xmm10, %xmm9, %xmm0 | |
| vpextrd $3, %xmm6, %edi | |
| vpextrd $3, %xmm0, %ebp | |
| vpshufd $250, %xmm3, %xmm0 # xmm0 = xmm3[2,2,3,3] | |
| .loc 1 236 21 is_stmt 0 # 03-matrix-multiplication-cpu.py:236:21 | |
| movslq %edi, %rdi | |
| .loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33 | |
| vpbroadcastq %xmm0, %zmm8 | |
| .loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21 | |
| leaq (%r13,%rdi,4), %rdi | |
| .loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33 | |
| vpmulld %xmm10, %xmm8, %xmm0 | |
| vpextrd $3, %xmm0, %ebx | |
| vpshufd $255, %xmm3, %xmm0 # xmm0 = xmm3[3,3,3,3] | |
| vpbroadcastq %xmm0, %zmm29 | |
| .loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21 | |
| movslq %ebx, %rbx | |
| .loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33 | |
| vpmulld %xmm10, %xmm29, %xmm0 | |
| .loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21 | |
| leaq (%r13,%rbx,4), %rbx | |
| .loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33 | |
| vpextrd $3, %xmm0, %r11d | |
| vpmulld %xmm10, %xmm22, %xmm0 | |
| .loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21 | |
| movslq %r11d, %r11 | |
| leaq (%r13,%r11,4), %r11 | |
| .loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33 | |
| vpextrd $3, %xmm0, %r8d | |
| .loc 1 239 21 is_stmt 1 # 03-matrix-multiplication-cpu.py:239:21 | |
| vmovddup .LCPI0_194(%rip), %xmm0 # xmm0 = [4,0,4,0] | |
| # xmm0 = mem[0,0] | |
| .loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21 | |
| movslq %r8d, %r8 | |
| leaq (%r13,%r8,4), %r8 | |
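| # NOTE (annotation, not compiler output): each 236:33/236:21 pair above | |
| # computes one row's store address: a broadcast row index is multiplied by | |
| # the stride held in %xmm10 (vpmulld), dword 3 is extracted (vpextrd), | |
| # sign-extended (movslq), and scaled by 4 bytes (fp32) off %r13 (leaq). | |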
| .loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21 | |
| vpermi2ps %xmm28, %xmm16, %xmm0 | |
| vinsertf128 $1, %xmm0, %ymm0, %ymm0 | |
| vinsertf128 $1, %xmm5, %ymm0, %ymm5 | |
| vblendps $192, %ymm0, %ymm5, %ymm0 # ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] | |
| vmovlhps %xmm12, %xmm24, %xmm5 # xmm5 = xmm24[0],xmm12[0] | |
| vshufps $36, %xmm5, %xmm4, %xmm5 # xmm5 = xmm4[0,1],xmm5[2,0] | |
| vblendps $15, %ymm5, %ymm0, %ymm0 # ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] | |
| vinsertf64x4 $0, %ymm0, %zmm2, %zmm13 | |
| vunpcklps %xmm24, %xmm12, %xmm0 # xmm0 = xmm12[0],xmm24[0],xmm12[1],xmm24[1] | |
| vinsertf128 $1, %xmm7, %ymm0, %ymm5 | |
| vunpcklps %xmm16, %xmm28, %xmm7 # xmm7 = xmm28[0],xmm16[0],xmm28[1],xmm16[1] | |
| vblendps $3, %xmm11, %xmm0, %xmm2 # xmm2 = xmm11[0,1],xmm0[2,3] | |
| vinsertf128 $1, %xmm7, %ymm0, %ymm7 | |
| vblendps $192, %ymm7, %ymm5, %ymm0 # ymm0 = ymm5[0,1,2,3,4,5],ymm7[6,7] | |
| .loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33 | |
| vpshufd $85, %ymm3, %ymm5 # ymm5 = ymm3[1,1,1,1,5,5,5,5] | |
| vextracti128 $1, %ymm5, %xmm7 | |
| .loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21 | |
| vblendps $15, %ymm2, %ymm0, %ymm0 # ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] | |
| vinsertps $179, %xmm24, %xmm12, %xmm2 # xmm2 = zero,zero,xmm12[2],xmm24[2] | |
| .loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33 | |
| vpmulld %xmm10, %xmm7, %xmm7 | |
| vpextrd $3, %xmm7, %r10d | |
| vpshufd $170, %ymm3, %ymm7 # ymm7 = ymm3[2,2,2,2,6,6,6,6] | |
| vextracti128 $1, %ymm7, %xmm11 | |
| vpmulld %xmm10, %xmm11, %xmm11 | |
| vpextrd $3, %xmm11, %r9d | |
| vmovdqa 1392(%rsp), %xmm11 # 16-byte Reload | |
| .loc 1 236 21 is_stmt 0 # 03-matrix-multiplication-cpu.py:236:21 | |
| movslq %r9d, %r9 | |
| leaq (%r13,%r9,4), %r9 | |
| .loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33 | |
| vpbroadcastd %xmm11, %zmm19 | |
| vpmulld %xmm10, %xmm19, %xmm6 | |
| vpextrd $3, %xmm6, %esi | |
| vextracti32x4 $2, %zmm20, %xmm6 | |
| vpmulld %xmm10, %xmm6, %xmm6 | |
| .loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21 | |
| movslq %esi, %rsi | |
| leaq (%r13,%rsi,4), %rsi | |
| .loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33 | |
| vpextrd $3, %xmm6, %edx | |
| vpshufd $170, %zmm3, %zmm6 # zmm6 = zmm3[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14] | |
| vextracti32x4 $2, %zmm6, %xmm4 | |
| .loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21 | |
| movslq %edx, %rdx | |
| .loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33 | |
| vpmulld %xmm10, %xmm4, %xmm4 | |
| .loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21 | |
| leaq (%r13,%rdx,4), %rdx | |
| .loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33 | |
| vpextrd $3, %xmm4, %ecx | |
| vmovupd 96(%rsp), %zmm4 # 64-byte Reload | |
| .loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21 | |
| movslq %ecx, %rcx | |
| leaq (%r13,%rcx,4), %rcx | |
| .loc 1 239 21 is_stmt 1 # 03-matrix-multiplication-cpu.py:239:21 | |
| vmovapd %zmm4, %zmm14 {%k1} | |
| vmovaps %xmm27, %xmm4 | |
| vinsertf64x4 $0, %ymm0, %zmm14, %zmm0 | |
| vblendps $3, %xmm4, %xmm2, %xmm2 # xmm2 = xmm4[0,1],xmm2[2,3] | |
| vinsertf128 $1, 2032(%rsp), %ymm0, %ymm4 # 16-byte Folded Reload | |
| vinsertps $179, %xmm16, %xmm28, %xmm27 # xmm27 = zero,zero,xmm28[2],xmm16[2] | |
| vinsertf32x4 $1, %xmm27, %ymm0, %ymm14 | |
| vmovupd 1840(%rsp), %zmm27 # 64-byte Reload | |
| vblendpd $8, %ymm14, %ymm4, %ymm14 # ymm14 = ymm4[0,1,2],ymm14[3] | |
| .loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33 | |
| vpbroadcastd %r14d, %zmm4 | |
| xorl %r14d, %r14d | |
| .loc 1 238 58 is_stmt 0 # 03-matrix-multiplication-cpu.py:238:58 | |
| cmpl 488(%rsp), %r12d # 4-byte Folded Reload | |
| .loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33 | |
| vpcmpgtd %zmm9, %zmm4, %k3 | |
| vpcmpgtd %zmm31, %zmm4, %k2 | |
| vpcmpgtd %zmm29, %zmm4, %k5 | |
| vpcmpgtd %zmm8, %zmm4, %k4 | |
| vmovupd 288(%rsp), %zmm9 # 64-byte Reload | |
| .loc 1 239 21 is_stmt 1 # 03-matrix-multiplication-cpu.py:239:21 | |
| vblendpd $3, %ymm2, %ymm14, %ymm2 # ymm2 = ymm2[0,1],ymm14[2,3] | |
| vmovaps 2096(%rsp), %xmm14 # 16-byte Reload | |
| vmovupd 2352(%rsp), %zmm29 # 64-byte Reload | |
| .loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33 | |
| kunpckwd %k2, %k3, %k2 | |
| kunpckwd %k4, %k5, %k3 | |
| kunpckdq %k2, %k3, %k2 | |
| .loc 1 238 39 is_stmt 0 # 03-matrix-multiplication-cpu.py:238:39 | |
| kshiftrq $15, %k2, %k2 | |
| cmovgeq %r14, %r15 | |
| .loc 1 236 21 is_stmt 1 # 03-matrix-multiplication-cpu.py:236:21 | |
| leaq (%r13,%rax,4), %r14 | |
| .loc 1 236 52 is_stmt 0 # 03-matrix-multiplication-cpu.py:236:52 | |
| movslq %r12d, %rax | |
| .loc 1 238 39 is_stmt 1 # 03-matrix-multiplication-cpu.py:238:39 | |
| kmovq %r15, %k0 | |
| kandq %k0, %k2, %k2 | |
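| # NOTE (annotation, not compiler output): the 238:33/238:39 blocks build the | |
| # line-238 boundary mask: four 16-lane vpcmpgtd results are concatenated via | |
| # kunpckwd/kunpckdq into one 64-bit mask, then shifted (kshiftrq) and ANDed | |
| # with the predicate held in %k0 (kandq). | |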
| .loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21 | |
| vpmovm2d %k2, %zmm8 | |
| vpbroadcastd %xmm8, %zmm8 | |
| kshiftrq $32, %k2, %k4 | |
| vpmovd2m %zmm8, %k3 | |
| vmovupd 160(%rsp), %zmm8 # 64-byte Reload | |
| vmovups %zmm13, (%r14,%rax,4) {%k3} | |
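| # NOTE (annotation, not compiler output): store idiom for one 16-float row | |
| # of the output tile: a single bit of the combined mask is replicated across | |
| # all lanes (vpmovm2d + vpbroadcastd + vpmovd2m) and used as the write mask | |
| # of a masked vmovups; the same pattern repeats for each row below. | |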
| .loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21 | |
| movslq %ebp, %r14 | |
| .loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21 | |
| kshiftrq $16, %k2, %k3 | |
| kshiftrq $48, %k2, %k2 | |
| .loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21 | |
| leaq (%r13,%r14,4), %r14 | |
| .loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21 | |
| vmovapd %zmm9, %zmm8 {%k1} | |
| vunpckhps %xmm16, %xmm28, %xmm9 # xmm9 = xmm28[2],xmm16[2],xmm28[3],xmm16[3] | |
| vinsertf64x4 $0, %ymm2, %zmm8, %zmm8 | |
| vinsertf32x4 $1, %xmm17, %ymm0, %ymm2 | |
| vpmovm2d %k3, %zmm17 | |
| vpbroadcastd %xmm17, %zmm17 | |
| vinsertf128 $1, %xmm9, %ymm0, %ymm9 | |
| vpmovd2m %zmm17, %k3 | |
| vmovups 1968(%rsp), %zmm17 # 64-byte Reload | |
| vmovups %zmm0, (%r14,%rax,4) {%k3} | |
| vpmovm2d %k4, %zmm0 | |
| vpbroadcastd %xmm0, %zmm0 | |
| .loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33 | |
| vpcmpgtd %zmm22, %zmm4, %k3 | |
| vmovups 880(%rsp), %zmm22 # 64-byte Reload | |
| .loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21 | |
| vblendpd $8, %ymm9, %ymm2, %ymm2 # ymm2 = ymm2[0,1,2],ymm9[3] | |
| vunpckhps %xmm24, %xmm12, %xmm9 # xmm9 = xmm12[2],xmm24[2],xmm12[3],xmm24[3] | |
| vpmovd2m %zmm0, %k4 | |
| .loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33 | |
| vshufi64x2 $85, %zmm5, %zmm5, %zmm0 # zmm0 = zmm5[2,3,2,3,2,3,2,3] | |
| .loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21 | |
| vunpckhps %ymm24, %ymm12, %ymm5 # ymm5 = ymm12[2],ymm24[2],ymm12[3],ymm24[3],ymm12[6],ymm24[6],ymm12[7],ymm24[7] | |
| vshufps $226, %xmm9, %xmm14, %xmm9 # xmm9 = xmm14[2,0],xmm9[2,3] | |
| vmovups 784(%rsp), %ymm14 # 32-byte Reload | |
| vmovups %zmm8, (%rbx,%rax,4) {%k4} | |
| vpermpd $170, %ymm5, %ymm5 # ymm5 = ymm5[2,2,2,2] | |
| vblendpd $3, %ymm9, %ymm2, %ymm2 # ymm2 = ymm9[0,1],ymm2[2,3] | |
| vmovupd 416(%rsp), %zmm9 # 64-byte Reload | |
| .loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33 | |
| vpcmpgtd %zmm0, %zmm4, %k4 | |
| .loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21 | |
| vpmovm2d %k2, %zmm0 | |
| vpbroadcastd %xmm0, %zmm0 | |
| vpmovd2m %zmm0, %k2 | |
| .loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33 | |
| vshufi64x2 $85, %zmm7, %zmm7, %zmm0 # zmm0 = zmm7[2,3,2,3,2,3,2,3] | |
| .loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33 | |
| kunpckwd %k3, %k4, %k3 | |
| .loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21 | |
| vmovaps %zmm12, %zmm7 | |
| vpermt2ps %zmm24, %zmm15, %zmm7 | |
| vinsertf64x4 $0, %ymm2, %zmm9, %zmm13 | |
| vmovups 2224(%rsp), %ymm2 # 32-byte Reload | |
| vunpcklps %ymm24, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm24[0],ymm12[1],ymm24[1],ymm12[4],ymm24[4],ymm12[5],ymm24[5] | |
| vpermpd $170, %ymm9, %ymm9 # ymm9 = ymm9[2,2,2,2] | |
| vmovups %zmm13, (%r11,%rax,4) {%k2} | |
| .loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33 | |
| vpcmpgtd %zmm0, %zmm4, %k2 | |
| .loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33 | |
| vpshufd $255, %ymm3, %ymm0 # ymm0 = ymm3[3,3,3,3,7,7,7,7] | |
| vshufi64x2 $85, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[2,3,2,3,2,3,2,3] | |
| .loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33 | |
| vpcmpgtd %zmm0, %zmm4, %k5 | |
| kunpckwd %k2, %k5, %k2 | |
| kunpckdq %k3, %k2, %k2 | |
| .loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21 | |
| vextractf128 $1, %ymm2, %xmm2 | |
| .loc 1 238 39 # 03-matrix-multiplication-cpu.py:238:39 | |
| kshiftrq $15, %k2, %k2 | |
| kandq %k0, %k2, %k5 | |
| .loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21 | |
| vblendps $3, %xmm2, %xmm9, %xmm2 # xmm2 = xmm2[0,1],xmm9[2,3] | |
| vunpcklpd %ymm28, %ymm16, %ymm9 # ymm9 = ymm16[0],ymm28[0],ymm16[2],ymm28[2] | |
| vpmovm2d %k5, %zmm0 | |
| vpbroadcastd %xmm0, %zmm0 | |
| kshiftrq $32, %k5, %k6 | |
| vshufps $36, %ymm9, %ymm14, %ymm9 # ymm9 = ymm14[0,1],ymm9[2,0],ymm14[4,5],ymm9[6,4] | |
| vpmovd2m %zmm0, %k2 | |
| vblendps $15, %ymm2, %ymm9, %ymm2 # ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] | |
| vmovups 560(%rsp), %zmm9 # 64-byte Reload | |
| vinsertf64x4 $0, %ymm2, %zmm9, %zmm14 | |
| vmovddup .LCPI0_30(%rip), %xmm9 # xmm9 = [5,13,5,13] | |
| # xmm9 = mem[0,0] | |
| vmovups 2288(%rsp), %ymm2 # 32-byte Reload | |
| vmovups %zmm14, (%r8,%rax,4) {%k2} | |
| .loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21 | |
| movslq %r10d, %r8 | |
| .loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21 | |
| kshiftrq $16, %k5, %k2 | |
| kshiftrq $48, %k5, %k5 | |
| vmovups 1200(%rsp), %zmm14 # 64-byte Reload | |
| .loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21 | |
| leaq (%r13,%r8,4), %r8 | |
| .loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21 | |
| vpermi2ps %ymm24, %ymm12, %ymm9 | |
| vextractf128 $1, %ymm2, %xmm2 | |
| vblendps $3, %xmm2, %xmm9, %xmm2 # xmm2 = xmm2[0,1],xmm9[2,3] | |
| vunpcklps %ymm16, %ymm28, %ymm9 # ymm9 = ymm28[0],ymm16[0],ymm28[1],ymm16[1],ymm28[4],ymm16[4],ymm28[5],ymm16[5] | |
| vblendps $63, 2160(%rsp), %ymm9, %ymm9 # 32-byte Folded Reload | |
| # ymm9 = mem[0,1,2,3,4,5],ymm9[6,7] | |
| vblendps $15, %ymm2, %ymm9, %ymm0 # ymm0 = ymm2[0,1,2,3],ymm9[4,5,6,7] | |
| vmovups 688(%rsp), %zmm2 # 64-byte Reload | |
| vunpcklps %zmm16, %zmm28, %zmm9 # zmm9 = zmm28[0],zmm16[0],zmm28[1],zmm16[1],zmm28[4],zmm16[4],zmm28[5],zmm16[5],zmm28[8],zmm16[8],zmm28[9],zmm16[9],zmm28[12],zmm16[12],zmm28[13],zmm16[13] | |
| vinsertf64x4 $0, %ymm0, %zmm2, %zmm0 | |
| vpmovm2d %k2, %zmm2 | |
| vpbroadcastd %xmm2, %zmm2 | |
| vpmovd2m %zmm2, %k2 | |
| vmovups %zmm0, (%r8,%rax,4) {%k2} | |
| .loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33 | |
| vpmulld %xmm10, %xmm11, %xmm0 | |
| .loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21 | |
| vmovapd .LCPI0_20(%rip), %zmm11 # zmm11 = [0,8,0,8,4,12,4,13] | |
| .loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33 | |
| vpextrd $3, %xmm0, %r8d | |
| vshufi64x2 $255, %zmm20, %zmm20, %zmm0 # zmm0 = zmm20[6,7,6,7,6,7,6,7] | |
| .loc 1 236 21 is_stmt 0 # 03-matrix-multiplication-cpu.py:236:21 | |
| movslq %r8d, %r8 | |
| .loc 1 238 33 is_stmt 1 # 03-matrix-multiplication-cpu.py:238:33 | |
| vpcmpgtd %zmm0, %zmm4, %k2 | |
| .loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33 | |
| vshufi64x2 $255, %zmm6, %zmm6, %zmm0 # zmm0 = zmm6[6,7,6,7,6,7,6,7] | |
| .loc 1 236 21 is_stmt 0 # 03-matrix-multiplication-cpu.py:236:21 | |
| leaq (%r13,%r8,4), %r8 | |
| .loc 1 238 33 is_stmt 1 # 03-matrix-multiplication-cpu.py:238:33 | |
| vpcmpgtd %zmm0, %zmm4, %k3 | |
| .loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33 | |
| vpshufd $255, %zmm3, %zmm0 # zmm0 = zmm3[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] | |
| vmovups 752(%rsp), %ymm3 # 32-byte Reload | |
| vshufi64x2 $255, %zmm0, %zmm0, %zmm2 # zmm2 = zmm0[6,7,6,7,6,7,6,7] | |
| vshufi64x2 $170, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[4,5,4,5,4,5,4,5] | |
| .loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33 | |
| vpcmpgtd %zmm2, %zmm4, %k4 | |
| vmovups 1456(%rsp), %ymm2 # 32-byte Reload | |
| .loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21 | |
| vextractf128 $1, %ymm2, %xmm2 | |
| vblendps $3, %xmm2, %xmm5, %xmm2 # xmm2 = xmm2[0,1],xmm5[2,3] | |
| vunpckhpd %ymm28, %ymm16, %ymm5 # ymm5 = ymm16[1],ymm28[1],ymm16[3],ymm28[3] | |
| vshufps $36, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0,1],ymm5[2,0],ymm3[4,5],ymm5[6,4] | |
| vmovups 624(%rsp), %zmm3 # 64-byte Reload | |
| vblendps $15, %ymm2, %ymm5, %ymm2 # ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] | |
| vpmovm2d %k6, %zmm5 | |
| vpbroadcastd %xmm5, %zmm5 | |
| vpmovd2m %zmm5, %k6 | |
| vmovddup .LCPI0_13(%rip), %xmm5 # xmm5 = [7,15,7,15] | |
| # xmm5 = mem[0,0] | |
| vinsertf64x4 $0, %ymm2, %zmm3, %zmm2 | |
| vmovups 1072(%rsp), %zmm3 # 64-byte Reload | |
| vmovups %zmm2, (%r9,%rax,4) {%k6} | |
| .loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33 | |
| vshufi64x2 $170, %zmm20, %zmm20, %zmm2 # zmm2 = zmm20[4,5,4,5,4,5,4,5] | |
| .loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21 | |
| vpermi2ps %ymm24, %ymm12, %ymm5 | |
| .loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33 | |
| vpcmpgtd %zmm19, %zmm4, %k6 | |
| vmovupd 1904(%rsp), %zmm19 # 64-byte Reload | |
| vpcmpgtd %zmm2, %zmm4, %k7 | |
| vmovups 1712(%rsp), %ymm2 # 32-byte Reload | |
| kunpckwd %k6, %k7, %k6 | |
| vpcmpgtd %zmm0, %zmm4, %k7 | |
| .loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21 | |
| vmovdqa64 %zmm25, %zmm0 | |
| vpermt2ps %zmm18, %zmm15, %zmm0 | |
| vpermi2ps %zmm16, %zmm28, %zmm15 | |
| vextractf128 $1, %ymm2, %xmm2 | |
| vblendps $3, %xmm2, %xmm5, %xmm2 # xmm2 = xmm2[0,1],xmm5[2,3] | |
| vunpckhps %ymm16, %ymm28, %ymm5 # ymm5 = ymm28[2],ymm16[2],ymm28[3],ymm16[3],ymm28[6],ymm16[6],ymm28[7],ymm16[7] | |
| vblendps $63, 1520(%rsp), %ymm5, %ymm5 # 32-byte Folded Reload | |
| # ymm5 = mem[0,1,2,3,4,5],ymm5[6,7] | |
| vblendps $15, %ymm2, %ymm5, %ymm2 # ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] | |
| vpmovm2d %k5, %zmm5 | |
| vpbroadcastd %xmm5, %zmm5 | |
| vinsertf64x4 $0, %ymm2, %zmm3, %zmm2 | |
| vpmovd2m %zmm5, %k5 | |
| vmovupd 1584(%rsp), %zmm3 # 64-byte Reload | |
| vmovups %zmm2, (%rdi,%rax,4) {%k5} | |
| .loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33 | |
| vshufi64x2 $170, %zmm6, %zmm6, %zmm2 # zmm2 = zmm6[4,5,4,5,4,5,4,5] | |
| .loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33 | |
| vpcmpgtd %zmm2, %zmm4, %k5 | |
| vmovupd 1264(%rsp), %zmm2 # 64-byte Reload | |
| kunpckwd %k5, %k7, %k5 | |
| kunpckdq %k6, %k5, %k5 | |
| .loc 1 238 39 is_stmt 0 # 03-matrix-multiplication-cpu.py:238:39 | |
| kshiftrq $15, %k5, %k5 | |
| kandq %k0, %k5, %k5 | |
| .loc 1 239 21 is_stmt 1 # 03-matrix-multiplication-cpu.py:239:21 | |
| vshufpd $128, %zmm0, %zmm2, %zmm29 {%k1} # zmm29 {%k1} = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[7] | |
| vpermt2pd %zmm0, %zmm11, %zmm2 | |
| vextractf32x4 $2, %zmm7, %xmm0 | |
| vmovapd %zmm2, %zmm30 {%k1} | |
| vextractf32x4 $2, %zmm22, %xmm2 | |
| vblendps $3, %xmm2, %xmm0, %xmm0 # xmm0 = xmm2[0,1],xmm0[2,3] | |
| vshuff64x2 $170, %zmm15, %zmm15, %zmm2 # zmm2 = zmm15[4,5,4,5,4,5,4,5] | |
| vshuff64x2 $170, %zmm27, %zmm27, %zmm5 # zmm5 = zmm27[4,5,4,5,4,5,4,5] | |
| vblendpd $8, %ymm2, %ymm5, %ymm2 # ymm2 = ymm5[0,1,2],ymm2[3] | |
| vblendpd $3, %ymm0, %ymm2, %ymm0 # ymm0 = ymm0[0,1],ymm2[2,3] | |
| vpmovm2d %k5, %zmm2 | |
| vpbroadcastd %xmm2, %zmm2 | |
| vpmovd2m %zmm2, %k6 | |
| vmovupd 352(%rsp), %zmm2 # 64-byte Reload | |
| vinsertf64x4 $0, %ymm0, %zmm30, %zmm0 | |
| vmovupd 816(%rsp), %zmm30 # 64-byte Reload | |
| vmovups %zmm0, (%rsi,%rax,4) {%k6} | |
| vpunpckldq %zmm18, %zmm25, %zmm0 # zmm0 = zmm25[0],zmm18[0],zmm25[1],zmm18[1],zmm25[4],zmm18[4],zmm25[5],zmm18[5],zmm25[8],zmm18[8],zmm25[9],zmm18[9],zmm25[12],zmm18[12],zmm25[13],zmm18[13] | |
| kshiftrq $16, %k5, %k6 | |
| vshufpd $128, %zmm0, %zmm2, %zmm23 {%k1} # zmm23 {%k1} = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[7] | |
| vpermt2pd %zmm0, %zmm11, %zmm2 | |
| vunpcklps %zmm24, %zmm12, %zmm0 # zmm0 = zmm12[0],zmm24[0],zmm12[1],zmm24[1],zmm12[4],zmm24[4],zmm12[5],zmm24[5],zmm12[8],zmm24[8],zmm12[9],zmm24[9],zmm12[12],zmm24[12],zmm12[13],zmm24[13] | |
| vextractf32x4 $2, %zmm0, %xmm5 | |
| vshuff64x2 $170, %zmm9, %zmm9, %zmm8 # zmm8 = zmm9[4,5,4,5,4,5,4,5] | |
| vmovapd %zmm2, %zmm3 {%k1} | |
| vextractf32x4 $2, %zmm17, %xmm2 | |
| vblendps $3, %xmm2, %xmm5, %xmm2 # xmm2 = xmm2[0,1],xmm5[2,3] | |
| vshuff64x2 $170, %zmm19, %zmm19, %zmm5 # zmm5 = zmm19[4,5,4,5,4,5,4,5] | |
| vblendpd $8, %ymm8, %ymm5, %ymm5 # ymm5 = ymm5[0,1,2],ymm8[3] | |
| vextractf32x4 $2, %zmm14, %xmm8 | |
| vblendpd $3, %ymm2, %ymm5, %ymm2 # ymm2 = ymm2[0,1],ymm5[2,3] | |
| vpmovm2d %k6, %zmm5 | |
| vpbroadcastd %xmm5, %zmm5 | |
| vpmovd2m %zmm5, %k6 | |
| vmovupd 1328(%rsp), %zmm5 # 64-byte Reload | |
| vinsertf64x4 $0, %ymm2, %zmm3, %zmm2 | |
| vmovupd 1648(%rsp), %zmm3 # 64-byte Reload | |
| vmovups %zmm2, (%rdx,%rax,4) {%k6} | |
| vmovdqa64 %zmm25, %zmm2 | |
| vpermt2ps %zmm18, %zmm21, %zmm2 | |
| kshiftrq $32, %k5, %k6 | |
| vshufpd $128, %zmm2, %zmm5, %zmm30 {%k1} # zmm30 {%k1} = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[7] | |
| vpermt2pd %zmm2, %zmm11, %zmm5 | |
| vpunpckhdq %zmm18, %zmm25, %zmm2 # zmm2 = zmm25[2],zmm18[2],zmm25[3],zmm18[3],zmm25[6],zmm18[6],zmm25[7],zmm18[7],zmm25[10],zmm18[10],zmm25[11],zmm18[11],zmm25[14],zmm18[14],zmm25[15],zmm18[15] | |
| vmovupd 944(%rsp), %zmm25 # 64-byte Reload | |
| vmovupd 1136(%rsp), %zmm18 # 64-byte Reload | |
| vpermi2pd %zmm2, %zmm26, %zmm11 | |
| vmovapd %zmm5, %zmm3 {%k1} | |
| vmovaps %zmm12, %zmm5 | |
| vpermt2ps %zmm24, %zmm21, %zmm5 | |
| vpermi2ps %zmm16, %zmm28, %zmm21 | |
| vshufpd $128, %zmm2, %zmm26, %zmm25 {%k1} # zmm25 {%k1} = zmm26[0],zmm2[0],zmm26[2],zmm2[2],zmm26[4],zmm2[4],zmm26[6],zmm2[7] | |
| vshuff64x2 $170, %zmm18, %zmm18, %zmm13 # zmm13 = zmm18[4,5,4,5,4,5,4,5] | |
| vextractf32x4 $2, %zmm5, %xmm2 | |
| vblendps $3, %xmm8, %xmm2, %xmm2 # xmm2 = xmm8[0,1],xmm2[2,3] | |
| vshuff64x2 $170, %zmm21, %zmm21, %zmm8 # zmm8 = zmm21[4,5,4,5,4,5,4,5] | |
| vblendpd $8, %ymm8, %ymm13, %ymm8 # ymm8 = ymm13[0,1,2],ymm8[3] | |
| vmovupd 1776(%rsp), %zmm13 # 64-byte Reload | |
| vblendpd $3, %ymm2, %ymm8, %ymm2 # ymm2 = ymm2[0,1],ymm8[2,3] | |
| vpmovm2d %k6, %zmm8 | |
| vpbroadcastd %xmm8, %zmm8 | |
| vinsertf64x4 $0, %ymm2, %zmm3, %zmm2 | |
| vmovdqa 1408(%rsp), %xmm3 # 16-byte Reload | |
| vpmovd2m %zmm8, %k6 | |
| vmovups %zmm2, (%rcx,%rax,4) {%k6} | |
| vmovapd %zmm11, %zmm13 {%k1} | |
| vunpckhps %zmm16, %zmm28, %zmm11 # zmm11 = zmm28[2],zmm16[2],zmm28[3],zmm16[3],zmm28[6],zmm16[6],zmm28[7],zmm16[7],zmm28[10],zmm16[10],zmm28[11],zmm16[11],zmm28[14],zmm16[14],zmm28[15],zmm16[15] | |
| kshiftrq $48, %k5, %k1 | |
| vshuff64x2 $170, %zmm11, %zmm11, %zmm8 # zmm8 = zmm11[4,5,4,5,4,5,4,5] | |
| .loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33 | |
| vpbroadcastd %xmm3, %zmm2 | |
| vpmulld %xmm10, %xmm3, %xmm3 | |
| .loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33 | |
| vpcmpgtd %zmm2, %zmm4, %k6 | |
| .loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33 | |
| vpmulld %xmm10, %xmm2, %xmm2 | |
| vpextrd $3, %xmm3, %ecx | |
| vpextrd $3, %xmm2, %edi | |
| vextracti32x4 $3, %zmm20, %xmm2 | |
| vmovups 1008(%rsp), %zmm20 # 64-byte Reload | |
| .loc 1 236 21 is_stmt 0 # 03-matrix-multiplication-cpu.py:236:21 | |
| movslq %ecx, %rcx | |
| .loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33 | |
| vpmulld %xmm10, %xmm2, %xmm2 | |
| .loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21 | |
| movslq %edi, %rdi | |
| leaq (%r13,%rcx,4), %rcx | |
| leaq (%r13,%rdi,4), %rdi | |
| .loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33 | |
| vpextrd $3, %xmm2, %esi | |
| vextracti32x4 $3, %zmm6, %xmm2 | |
| .loc 1 239 21 is_stmt 1 # 03-matrix-multiplication-cpu.py:239:21 | |
| vshuff64x2 $170, %zmm1, %zmm1, %zmm6 # zmm6 = zmm1[4,5,4,5,4,5,4,5] | |
| vextractf32x4 $3, %zmm0, %xmm0 | |
| vextractf64x4 $1, %zmm1, %ymm1 | |
| .loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33 | |
| vpmulld %xmm10, %xmm2, %xmm2 | |
| .loc 1 236 21 is_stmt 0 # 03-matrix-multiplication-cpu.py:236:21 | |
| movslq %esi, %rsi | |
| .loc 1 239 21 is_stmt 1 # 03-matrix-multiplication-cpu.py:239:21 | |
| vblendpd $8, %ymm8, %ymm6, %ymm6 # ymm6 = ymm6[0,1,2],ymm8[3] | |
| .loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21 | |
| leaq (%r13,%rsi,4), %rsi | |
| .loc 1 236 33 is_stmt 0 # 03-matrix-multiplication-cpu.py:236:33 | |
| vpextrd $3, %xmm2, %edx | |
| .loc 1 239 21 is_stmt 1 # 03-matrix-multiplication-cpu.py:239:21 | |
| vunpckhps %zmm24, %zmm12, %zmm2 # zmm2 = zmm12[2],zmm24[2],zmm12[3],zmm24[3],zmm12[6],zmm24[6],zmm12[7],zmm24[7],zmm12[10],zmm24[10],zmm12[11],zmm24[11],zmm12[14],zmm24[14],zmm12[15],zmm24[15] | |
| vextractf32x4 $2, %zmm20, %xmm3 | |
| vextractf32x4 $2, %zmm2, %xmm4 | |
| .loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21 | |
| movslq %edx, %rdx | |
| .loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21 | |
| vextractf32x4 $3, %zmm2, %xmm2 | |
| .loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21 | |
| leaq (%r13,%rdx,4), %rdx | |
| .loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21 | |
| vblendps $3, %xmm3, %xmm4, %xmm4 # xmm4 = xmm3[0,1],xmm4[2,3] | |
| vblendpd $3, %ymm4, %ymm6, %ymm4 # ymm4 = ymm4[0,1],ymm6[2,3] | |
| vpmovm2d %k1, %zmm6 | |
| vpbroadcastd %xmm6, %zmm6 | |
| vinsertf64x4 $0, %ymm4, %zmm13, %zmm4 | |
| vpmovd2m %zmm6, %k1 | |
| vextractf32x4 $3, %zmm22, %xmm6 | |
| vmovups %zmm4, (%r8,%rax,4) {%k1} | |
| vextractf32x4 $3, %zmm7, %xmm4 | |
| .loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33 | |
| kunpckwd %k6, %k2, %k1 | |
| kunpckwd %k3, %k4, %k2 | |
| .loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21 | |
| vextractf64x4 $1, %zmm27, %ymm7 | |
| .loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33 | |
| kunpckdq %k1, %k2, %k1 | |
| .loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21 | |
| vblendps $3, %xmm6, %xmm4, %xmm4 # xmm4 = xmm6[0,1],xmm4[2,3] | |
| vextractf64x4 $1, %zmm15, %ymm6 | |
| .loc 1 238 39 # 03-matrix-multiplication-cpu.py:238:39 | |
| kshiftrq $15, %k1, %k1 | |
| kandq %k0, %k1, %k0 | |
| .loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21 | |
| vblendpd $8, %ymm6, %ymm7, %ymm6 # ymm6 = ymm7[0,1,2],ymm6[3] | |
| vblendpd $3, %ymm4, %ymm6, %ymm4 # ymm4 = ymm4[0,1],ymm6[2,3] | |
| vpmovm2d %k0, %zmm6 | |
| vpbroadcastd %xmm6, %zmm6 | |
| vinsertf64x4 $0, %ymm4, %zmm29, %zmm4 | |
| vpmovd2m %zmm6, %k1 | |
| vextractf64x4 $1, %zmm9, %ymm6 | |
| vmovups %zmm4, (%rdi,%rax,4) {%k1} | |
| vextractf32x4 $3, %zmm17, %xmm4 | |
| kshiftrq $16, %k0, %k1 | |
| vblendps $3, %xmm4, %xmm0, %xmm0 # xmm0 = xmm4[0,1],xmm0[2,3] | |
| vextractf64x4 $1, %zmm19, %ymm4 | |
| vblendpd $8, %ymm6, %ymm4, %ymm4 # ymm4 = ymm4[0,1,2],ymm6[3] | |
| vblendpd $3, %ymm0, %ymm4, %ymm0 # ymm0 = ymm0[0,1],ymm4[2,3] | |
| vpmovm2d %k1, %zmm4 | |
| vpbroadcastd %xmm4, %zmm4 | |
| vinsertf64x4 $0, %ymm0, %zmm23, %zmm0 | |
| vpmovd2m %zmm4, %k1 | |
| vextractf32x4 $3, %zmm14, %xmm4 | |
| vmovups %zmm0, (%rsi,%rax,4) {%k1} | |
| vextractf32x4 $3, %zmm5, %xmm0 | |
| vextractf64x4 $1, %zmm18, %ymm5 | |
| kshiftrq $32, %k0, %k1 | |
| kshiftrq $48, %k0, %k0 | |
| vblendps $3, %xmm4, %xmm0, %xmm0 # xmm0 = xmm4[0,1],xmm0[2,3] | |
| vextractf64x4 $1, %zmm21, %ymm4 | |
| vblendpd $8, %ymm4, %ymm5, %ymm4 # ymm4 = ymm5[0,1,2],ymm4[3] | |
| vblendpd $3, %ymm0, %ymm4, %ymm0 # ymm0 = ymm0[0,1],ymm4[2,3] | |
| vpmovm2d %k1, %zmm4 | |
| vpbroadcastd %xmm4, %zmm4 | |
| vinsertf64x4 $0, %ymm0, %zmm30, %zmm0 | |
| vpmovd2m %zmm4, %k1 | |
| vmovups %zmm0, (%rdx,%rax,4) {%k1} | |
| vextractf32x4 $3, %zmm20, %xmm0 | |
| vblendps $3, %xmm0, %xmm2, %xmm0 # xmm0 = xmm0[0,1],xmm2[2,3] | |
| vextractf64x4 $1, %zmm11, %ymm2 | |
| vblendps $192, %ymm2, %ymm1, %ymm1 # ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] | |
| vblendps $15, %ymm0, %ymm1, %ymm0 # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] | |
| vpmovm2d %k0, %zmm1 | |
| vpbroadcastd %xmm1, %zmm1 | |
| vinsertf64x4 $0, %ymm0, %zmm25, %zmm0 | |
| vpmovd2m %zmm1, %k1 | |
| vmovups %zmm0, (%rcx,%rax,4) {%k1} | |
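| # NOTE (annotation, not compiler output): last masked row store of the | |
| # output tile for this kernel instance; the function epilogue (stack | |
| # restore, callee-saved pops, vzeroupper, ret) follows. | |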
| .loc 1 239 4 epilogue_begin is_stmt 0 # 03-matrix-multiplication-cpu.py:239:4 | |
| addq $3448, %rsp # imm = 0xD78 | |
| .cfi_def_cfa_offset 56 | |
| popq %rbx | |
| .cfi_def_cfa_offset 48 | |
| popq %r12 | |
| .cfi_def_cfa_offset 40 | |
| popq %r13 | |
| .cfi_def_cfa_offset 32 | |
| popq %r14 | |
| .cfi_def_cfa_offset 24 | |
| popq %r15 | |
| .cfi_def_cfa_offset 16 | |
| popq %rbp | |
| .cfi_def_cfa_offset 8 | |
| vzeroupper | |
| retq | |
| .Ltmp12: | |
| .Lfunc_end0: | |
| .size matmul_kernel, .Lfunc_end0-matmul_kernel | |
| .cfi_endproc | |
| # -- End function | |
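| # NOTE (annotation, not compiler output): everything below is DWARF debug | |
| # info (.debug_abbrev/.debug_info/.debug_ranges/.debug_str/.debug_line) | |
| # mapping this code back to matmul_kernel in 03-matrix-multiplication-cpu.py; | |
| # it does not affect execution. | |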
| .section .debug_abbrev,"",@progbits | |
| .byte 1 # Abbreviation Code | |
| .byte 17 # DW_TAG_compile_unit | |
| .byte 1 # DW_CHILDREN_yes | |
| .byte 37 # DW_AT_producer | |
| .byte 14 # DW_FORM_strp | |
| .byte 19 # DW_AT_language | |
| .byte 5 # DW_FORM_data2 | |
| .byte 3 # DW_AT_name | |
| .byte 14 # DW_FORM_strp | |
| .byte 16 # DW_AT_stmt_list | |
| .byte 23 # DW_FORM_sec_offset | |
| .byte 27 # DW_AT_comp_dir | |
| .byte 14 # DW_FORM_strp | |
| .byte 17 # DW_AT_low_pc | |
| .byte 1 # DW_FORM_addr | |
| .byte 18 # DW_AT_high_pc | |
| .byte 6 # DW_FORM_data4 | |
| .byte 0 # EOM(1) | |
| .byte 0 # EOM(2) | |
| .byte 2 # Abbreviation Code | |
| .byte 46 # DW_TAG_subprogram | |
| .byte 0 # DW_CHILDREN_no | |
| .byte 3 # DW_AT_name | |
| .byte 14 # DW_FORM_strp | |
| .byte 32 # DW_AT_inline | |
| .byte 11 # DW_FORM_data1 | |
| .byte 0 # EOM(1) | |
| .byte 0 # EOM(2) | |
| .byte 3 # Abbreviation Code | |
| .byte 46 # DW_TAG_subprogram | |
| .byte 1 # DW_CHILDREN_yes | |
| .byte 17 # DW_AT_low_pc | |
| .byte 1 # DW_FORM_addr | |
| .byte 18 # DW_AT_high_pc | |
| .byte 6 # DW_FORM_data4 | |
| .byte 49 # DW_AT_abstract_origin | |
| .byte 19 # DW_FORM_ref4 | |
| .byte 0 # EOM(1) | |
| .byte 0 # EOM(2) | |
| .byte 4 # Abbreviation Code | |
| .byte 29 # DW_TAG_inlined_subroutine | |
| .byte 0 # DW_CHILDREN_no | |
| .byte 49 # DW_AT_abstract_origin | |
| .byte 19 # DW_FORM_ref4 | |
| .byte 85 # DW_AT_ranges | |
| .byte 23 # DW_FORM_sec_offset | |
| .byte 88 # DW_AT_call_file | |
| .byte 11 # DW_FORM_data1 | |
| .byte 89 # DW_AT_call_line | |
| .byte 11 # DW_FORM_data1 | |
| .byte 87 # DW_AT_call_column | |
| .byte 11 # DW_FORM_data1 | |
| .byte 0 # EOM(1) | |
| .byte 0 # EOM(2) | |
| .byte 0 # EOM(3) | |
| .section .debug_info,"",@progbits | |
| .Lcu_begin0: | |
| .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit | |
| .Ldebug_info_start0: | |
| .short 4 # DWARF version number | |
| .long .debug_abbrev # Offset Into Abbrev. Section | |
| .byte 8 # Address Size (in bytes) | |
| .byte 1 # Abbrev [1] 0xb:0x5c DW_TAG_compile_unit | |
| .long .Linfo_string0 # DW_AT_producer | |
| .short 2 # DW_AT_language | |
| .long .Linfo_string1 # DW_AT_name | |
| .long .Lline_table_start0 # DW_AT_stmt_list | |
| .long .Linfo_string2 # DW_AT_comp_dir | |
| .quad .Lfunc_begin0 # DW_AT_low_pc | |
| .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc | |
| .byte 2 # Abbrev [2] 0x2a:0x6 DW_TAG_subprogram | |
| .long .Linfo_string3 # DW_AT_name | |
| .byte 1 # DW_AT_inline | |
| .byte 3 # Abbrev [3] 0x30:0x36 DW_TAG_subprogram | |
| .quad .Lfunc_begin0 # DW_AT_low_pc | |
| .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc | |
| .long 42 # DW_AT_abstract_origin | |
| .byte 4 # Abbrev [4] 0x41:0xc DW_TAG_inlined_subroutine | |
| .long 42 # DW_AT_abstract_origin | |
| .long .Ldebug_ranges0 # DW_AT_ranges | |
| .byte 1 # DW_AT_call_file | |
| .byte 189 # DW_AT_call_line | |
| .byte 27 # DW_AT_call_column | |
| .byte 4 # Abbrev [4] 0x4d:0xc DW_TAG_inlined_subroutine | |
| .long 42 # DW_AT_abstract_origin | |
| .long .Ldebug_ranges1 # DW_AT_ranges | |
| .byte 1 # DW_AT_call_file | |
| .byte 190 # DW_AT_call_line | |
| .byte 27 # DW_AT_call_column | |
| .byte 4 # Abbrev [4] 0x59:0xc DW_TAG_inlined_subroutine | |
| .long 42 # DW_AT_abstract_origin | |
| .long .Ldebug_ranges2 # DW_AT_ranges | |
| .byte 1 # DW_AT_call_file | |
| .byte 217 # DW_AT_call_line | |
| .byte 33 # DW_AT_call_column | |
| .byte 0 # End Of Children Mark | |
| .byte 0 # End Of Children Mark | |
| .Ldebug_info_end0: | |
| .section .debug_ranges,"",@progbits | |
| .Ldebug_ranges0: | |
| .quad .Ltmp0-.Lfunc_begin0 | |
| .quad .Ltmp1-.Lfunc_begin0 | |
| .quad .Ltmp2-.Lfunc_begin0 | |
| .quad .Ltmp3-.Lfunc_begin0 | |
| .quad .Ltmp4-.Lfunc_begin0 | |
| .quad .Ltmp5-.Lfunc_begin0 | |
| .quad 0 | |
| .quad 0 | |
| .Ldebug_ranges1: | |
| .quad .Ltmp1-.Lfunc_begin0 | |
| .quad .Ltmp2-.Lfunc_begin0 | |
| .quad .Ltmp3-.Lfunc_begin0 | |
| .quad .Ltmp4-.Lfunc_begin0 | |
| .quad .Ltmp5-.Lfunc_begin0 | |
| .quad .Ltmp6-.Lfunc_begin0 | |
| .quad 0 | |
| .quad 0 | |
| .Ldebug_ranges2: | |
| .quad .Ltmp7-.Lfunc_begin0 | |
| .quad .Ltmp8-.Lfunc_begin0 | |
| .quad .Ltmp9-.Lfunc_begin0 | |
| .quad .Ltmp10-.Lfunc_begin0 | |
| .quad 0 | |
| .quad 0 | |
| .section .debug_str,"MS",@progbits,1 | |
| .Linfo_string0: | |
| .asciz "triton" # string offset=0 | |
| .Linfo_string1: | |
| .asciz "03-matrix-multiplication-cpu.py" # string offset=7 | |
| .Linfo_string2: | |
| .asciz "/data/users/minjang/triton-oss/triton-cpu/python/tutorials" # string offset=39 | |
| .Linfo_string3: | |
| .asciz "matmul_kernel" # string offset=98 | |
| .section ".note.GNU-stack","",@progbits | |
| .section .debug_line,"",@progbits | |
| .Lline_table_start0: |
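For context: the .loc markers in the listing point at lines 236-239 of 03-matrix-multiplication-cpu.py, i.e. the store epilogue of matmul_kernel. Below is a minimal sketch of the Triton source such an epilogue typically compiles from, following the standard Triton matmul tutorial; all names (offs_cm, stride_cm, BLOCK_SIZE_M, ...) are assumptions taken from that tutorial, not from this gist.

import triton
import triton.language as tl

@triton.jit
def matmul_kernel(  # hypothetical reconstruction, not the gist's exact source
        a_ptr, b_ptr, c_ptr, M, N, K,
        stride_am, stride_ak, stride_bk, stride_bn, stride_cm, stride_cn,
        BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr,
        BLOCK_SIZE_K: tl.constexpr):
    pid_m = tl.program_id(axis=0)
    pid_n = tl.program_id(axis=1)
    # Pointers into the A and B tiles for this program instance.
    offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    a_ptrs = a_ptr + offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak
    b_ptrs = b_ptr + offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn
    acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0)
        b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0)
        acc += tl.dot(a, b)
        a_ptrs += BLOCK_SIZE_K * stride_ak
        b_ptrs += BLOCK_SIZE_K * stride_bk
    c = acc
    # Store epilogue: the part the .loc 236/238/239 regions above compile to.
    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]  # 236: vpmulld/leaq address math
    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)                      # 238: vpcmpgtd + kunpck* mask build
    tl.store(c_ptrs, c, mask=c_mask)                                              # 239: masked vmovups {%k} stores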