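# NOTE: the lines below are GitHub gist page text captured together with the
# listing; they are not assembly and are kept only as comments.
# Skip to content
#
# Instantly share code, notes, and snippets.
#
# Show Gist options
#   • Save pashu123/d012c905e616495af0b52312635b0a48 to your computer and use it in GitHub Desktop.
# Save pashu123/d012c905e616495af0b52312635b0a48 to your computer and use it in GitHub Desktop.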
.text
.intel_syntax noprefix
.file "mmt3d_kernel_linked_llvm_cpu"
# -----------------------------------------------------------------------------
# turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_0_pack_f32
#
# Compiler-generated AVX-512 kernel (GAS, Intel syntax). The innermost loop
# loads 16 source rows of packed 32-bit elements (row stride 12800 bytes),
# transposes each 16x16 sub-block in registers and stores it as 16 consecutive
# 64-byte vectors (1024 bytes). Eight sub-blocks (128 columns) are handled per
# 16-row tile. Rows at or beyond the valid row count are loaded under an
# all-zero mask with zeroing, so they contribute zeros to the output.
#
# Inputs as used by the code (rdi on entry is overwritten without being read):
#   rsi -> struct S: dword @12 (CX), dword @16 (CY), word @20 (CZ),
#          qword @24 -> table of 32-bit constants,
#          qword @32 -> table of pointers (entry 0 = source base,
#                       entry 1 = destination base).
#   rdx -> struct W: dword @0 (IX), dword @4 (IY), word @8 (IZ).
#   NOTE(review): this layout looks like IREE's HAL executable dispatch ABI
#   (workgroup counts, push constants, binding pointers / workgroup ids) --
#   confirm against the runtime headers.
# Values read from the constant table:
#   N0 = dword@0 | (dword@4 shifted left 32)
#   N1 = dword@8 | (dword@12 shifted left 32)
#   Q  = qword@16
#   presumably dynamic dimension sizes -- TODO confirm.
# Returns: eax = 0.
# Stack: no call instructions (leaf). After the prologue rsp = rbp-224, while
#   locals are addressed down to [rbp-352], i.e. the lowest 128 bytes live
#   below rsp (red zone).
# Alignment: stores use vmovapd with zmm operands, which requires a 64-byte
#   aligned destination address -- assumes pointer-table entry 1 is 64-byte
#   aligned (every offset added to it here is a multiple of 64); TODO confirm.
# -----------------------------------------------------------------------------
.section .text.turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_0_pack_f32,"ax",@progbits
.p2align 4, 0x90
.type turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_0_pack_f32,@function
turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_0_pack_f32:
.Lfunc_begin0:
.file 1 "-"
.loc 1 1 0
.cfi_startproc
# Prologue: frame pointer plus the five callee-saved registers used below.
push rbp
.cfi_def_cfa_offset 16
.cfi_offset rbp, -16
mov rbp, rsp
.cfi_def_cfa_register rbp
.Ltmp0:
push r15
push r14
push r13
push r12
push rbx
sub rsp, 184
.cfi_offset rbx, -56
.cfi_offset r12, -48
.cfi_offset r13, -40
.cfi_offset r14, -32
.cfi_offset r15, -24
.loc 1 4 3 prologue_end
# rdi = constant table; rcx:rax = high:low dwords of N0, r12d = high dword of N1.
mov rdi, qword ptr [rsi + 24]
mov ecx, dword ptr [rdi + 4]
mov eax, dword ptr [rdi]
mov r12d, dword ptr [rdi + 12]
# r10 = N0 (64-bit).
mov r8, rcx
shl r8, 32
lea r10, [r8 + rax]
# r15 = signed ceil(N0 / 16), branch-free:
#   N0 > 0  -> ((N0 - 1) / 16) + 1
#   N0 <= 0 -> -((-N0) / 16)
# The "+15 if negative, then sar 4" pair is a round-toward-zero divide by 16.
lea r8, [r8 + rax - 1]
mov r9, r10
neg r9
test r10, r10
# [rbp-248] = N0 (mov does not disturb the flags consumed by cmovle).
mov qword ptr [rbp - 248], r10
cmovle r8, r9
lea r15, [r8 + 15]
test r8, r8
cmovns r15, r8
sar r15, 4
mov r8, r15
neg r8
inc r15
test r10, r10
# r10d = low dword of N1 (loaded between test and cmovle; flags unaffected).
mov r10d, dword ptr [rdi + 8]
cmovle r15, r8
# r8 = IZ (zero-extended word); r9 = N1; r11 = IZ * 64 = first outer index.
movzx r8d, word ptr [rdx + 8]
shl r12, 32
lea r9, [r12 + r10]
mov r11d, r8d
shl r11d, 6
# [rbp-144] = N1 (outer loop bound), [rbp-64] = current outer index.
mov qword ptr [rbp - 144], r9
mov qword ptr [rbp - 64], r11
# Nothing to do when the first outer index is already >= N1 (signed compare).
cmp r11, r9
jge .LBB0_17
.loc 1 0 3 is_stmt 0
# ---- One-time setup of loop strides and base pointers -----------------------
# Byte constants used below: 12800 = source row stride, 204800 = 16 * 12800,
# 819200 = 64 * 12800, 13107200 = 64 * 204800, 192000 = 15 * 12800,
# and 960 = 15 * 64.
# Stack slots written in this section:
#   [rbp-112] high dword of N0 (slot is reused later for a mask spill)
#   [rbp-120] pointer table    (slot is reused later for a mask spill)
#   [rbp-128] IY * 4           first tile index of the middle loop
#   [rbp-136] destination pointer for the outer loop (biased by +960)
#   [rbp-168] ceil(N0/16) - IY*4   tiles remaining for the middle loop
#   [rbp-176] Q * CZ * 13107200    outer-loop destination stride
#   [rbp-184] CZ * 64              outer-loop index step
#   [rbp-192] N0 * 3200 * CZ * 256 outer-loop source stride
#   [rbp-208] IX * 128             first column index
#   [rbp-216] CY * 819200          middle-loop stride (source and destination)
#   [rbp-224] CY * 4               middle-loop tile step
#   [rbp-232] IX
#   [rbp-240] ceil(N0/16)
#   [rbp-272] CX * 8192            column-loop destination stride
#   [rbp-280] CX * 128             column-loop index step
#   [rbp-288] CX * 512             column-loop source stride
#   [rbp-312] N0 * 12800           per-batch source stride
#   [rbp-320] Q * 204800           per-batch destination stride
#   [rbp-72]  source pointer for the outer loop (biased by +192000)
mov r9, qword ptr [rsi + 32]
mov rdi, qword ptr [rdi + 16]
mov qword ptr [rbp - 112], rcx
.loc 1 4 3
# r12 = N1 (OR equals ADD here: r12 has only high bits, r10 only low bits),
# then r12 = N1 - IZ*64 = remaining outer count.
or r12, r10
mov r13d, dword ptr [rsi + 12]
sub r12, qword ptr [rbp - 64]
mov qword ptr [rbp - 240], r15
mov qword ptr [rbp - 120], r9
# r11 = destination base (pointer table entry 1); r9 = high dword of N0.
mov r11, qword ptr [r9 + 8]
mov r9, rcx
# ecx = IX, edx = IY.
mov ecx, dword ptr [rdx]
mov edx, dword ptr [rdx + 4]
# r10 = Q * IZ * 13107200; rbx = IY * 819200; r14 = IX * 8192.
mov r10, rdi
imul r10, r8
imul r10, r10, 13107200
imul rbx, rdx, 819200
mov r14, rcx
shl r14, 13
mov qword ptr [rbp - 232], rcx
add r14, rbx
add r14, r10
# Destination start, biased by +960 so the 16 stores can use [rdx-960 .. rdx].
lea r10, [r11 + r14 + 960]
# 13743895347200 = 3200 * 2^32: multiplies the high dword of N0 by 3200 in place,
# so r14 below becomes N0 * 3200 as a 64-bit product.
movabs r11, 13743895347200
imul r14, rax, 3200
imul rax, rax, 12800
imul r11, r9
mov qword ptr [rbp - 136], r10
# r10d = CY, esi = CZ.
mov r10d, dword ptr [rsi + 16]
movzx esi, word ptr [rsi + 20]
add r14, r11
# r11 = IX * 512 + IY * 819200 (source byte offset for column and tile start).
mov r11, rcx
shl r11, 9
shl rcx, 7
add r11, rbx
mov rbx, qword ptr [rbp - 120]
# r8 = IZ * N0 * 3200; shifted left 8 below to give IZ*64 * N0*12800 bytes.
imul r8, r14
mov qword ptr [rbp - 208], rcx
mov rcx, r15
imul r14, rsi
shl r8, 8
add r11, r8
imul r8, rdi, 204800
imul rdi, rsi
shl esi, 6
# r9 = source base (pointer table entry 0).
mov r9, qword ptr [rbx]
mov qword ptr [rbp - 184], rsi
lea rsi, [4*rdx]
shl r14, 8
mov qword ptr [rbp - 192], r14
mov qword ptr [rbp - 320], r8
# 54975581388800 = 12800 * 2^32: high-dword part of N0 * 12800.
movabs r8, 54975581388800
imul r8, qword ptr [rbp - 112]
sub rcx, rsi
mov qword ptr [rbp - 128], rsi
mov qword ptr [rbp - 168], rcx
lea rcx, [4*r10]
# Source start, biased by +192000 (15 rows) to match the load displacements,
# which run from -191936 for row 0 to +64 for row 15.
lea rdx, [r9 + r11 + 192000]
mov qword ptr [rbp - 224], rcx
mov rcx, r13
shl rcx, 13
mov qword ptr [rbp - 272], rcx
mov qword ptr [rbp - 72], rdx
mov rdx, r13
shl rdx, 7
shl r13, 9
mov qword ptr [rbp - 280], rdx
imul rdx, rdi, 13107200
mov qword ptr [rbp - 288], r13
add rax, r8
mov qword ptr [rbp - 312], rax
mov qword ptr [rbp - 176], rdx
imul rdx, r10, 819200
mov qword ptr [rbp - 216], rdx
jmp .LBB0_2
.p2align 4, 0x90
# ---- Outer loop latch: advance index by CZ*64 and both base pointers --------
.LBB0_16:
.loc 1 0 3
mov rsi, qword ptr [rbp - 176]
mov rdx, qword ptr [rbp - 72]
mov rax, qword ptr [rbp - 64]
mov rcx, qword ptr [rbp - 184]
mov r12, qword ptr [rbp - 200]
.loc 1 4 3
add rdx, qword ptr [rbp - 192]
add qword ptr [rbp - 136], rsi
add rax, rcx
sub r12, rcx
mov qword ptr [rbp - 72], rdx
mov qword ptr [rbp - 64], rax
# Leave once the outer index reaches N1.
cmp rax, qword ptr [rbp - 144]
jge .LBB0_17
# ---- Outer loop header -------------------------------------------------------
# r12 = N1 - outer index. [rbp-328] = max(min(64, r12), 1) is the trip count of
# the batch loop (.LBB0_8).
.LBB0_2:
cmp r12, 64
mov ecx, 64
mov eax, 1
mov qword ptr [rbp - 200], r12
cmovl rcx, r12
cmp rcx, 2
cmovl rcx, rax
mov qword ptr [rbp - 328], rcx
# Skip the body when the first tile index (IY*4) is not below ceil(N0/16).
cmp r15, qword ptr [rbp - 128]
jle .LBB0_16
.loc 1 0 3
mov rdx, qword ptr [rbp - 72]
mov rcx, qword ptr [rbp - 136]
mov rax, qword ptr [rbp - 144]
.loc 1 4 3
# [rbp-296] = N1 - outer index (guards the batch loop);
# [rbp-160] / [rbp-152] = source / destination pointers for the middle loop;
# rcx = tiles remaining, rdx = current tile index (starts at IY*4).
sub rax, qword ptr [rbp - 64]
mov qword ptr [rbp - 160], rdx
mov qword ptr [rbp - 152], rcx
mov rcx, qword ptr [rbp - 168]
mov rdx, qword ptr [rbp - 128]
mov qword ptr [rbp - 296], rax
jmp .LBB0_4
.p2align 4, 0x90
# ---- Middle (tile-group) loop latch: step by CY*4 tiles ----------------------
.LBB0_15:
.loc 1 0 3
mov rsi, qword ptr [rbp - 216]
mov rax, qword ptr [rbp - 224]
mov rdx, qword ptr [rbp - 256]
mov rcx, qword ptr [rbp - 264]
# r15 was reduced in the header below; restore ceil(N0/16).
mov r15, qword ptr [rbp - 240]
.loc 1 4 3
add qword ptr [rbp - 152], rsi
add qword ptr [rbp - 160], rsi
add rdx, rax
sub rcx, rax
cmp rdx, r15
jge .LBB0_16
# ---- Middle loop header ------------------------------------------------------
# rdx = tile index, rcx = tiles remaining.
#   [rbp-352] = max(min(4, rcx), 1)        trip count of the tile loop (.LBB0_10)
#   [rbp-336] = ceil(N0/16) - tile index   guard for the tile loop
#   rdi       = min(16 * min(4, tiles left), N0 - 16 * tile index)
#             = number of valid source rows in this group of up to 4 tiles
.LBB0_4:
cmp rcx, 4
mov esi, 4
mov rdi, qword ptr [rbp - 248]
mov eax, 1
mov qword ptr [rbp - 256], rdx
mov qword ptr [rbp - 264], rcx
cmovl rsi, rcx
cmp rsi, 2
cmovl rsi, rax
sub r15, rdx
mov eax, 4
cmp r15, 4
mov qword ptr [rbp - 352], rsi
mov qword ptr [rbp - 336], r15
cmovl rax, r15
shl rdx, 4
shl rax, 4
sub rdi, rdx
cmp rax, rdi
cmovl rdi, rax
# Column loop runs only for IX <= 24 (unsigned), i.e. IX*128 < 3200.
cmp dword ptr [rbp - 232], 24
ja .LBB0_15
.loc 1 0 3
# [rbp-80] / [rbp-88] = destination / source pointers for the column loop;
# rcx = first column index (IX*128).
mov rcx, qword ptr [rbp - 152]
mov rax, qword ptr [rbp - 160]
mov qword ptr [rbp - 80], rcx
mov rcx, qword ptr [rbp - 208]
mov qword ptr [rbp - 88], rax
jmp .LBB0_6
.p2align 4, 0x90
# ---- Column loop latch: step by CX*128 columns, stop at column 3200 ----------
.LBB0_14:
mov rcx, qword ptr [rbp - 304]
mov rax, qword ptr [rbp - 80]
mov rdx, qword ptr [rbp - 88]
.loc 1 4 3
add rax, qword ptr [rbp - 272]
add rdx, qword ptr [rbp - 288]
add rcx, qword ptr [rbp - 280]
mov qword ptr [rbp - 80], rax
mov qword ptr [rbp - 88], rdx
cmp rcx, 3200
jge .LBB0_15
# ---- Column loop header ------------------------------------------------------
.LBB0_6:
.loc 1 0 3
# Skip the batch loop if no outer elements remain; [rbp-304] = column index.
cmp qword ptr [rbp - 296], 0
mov qword ptr [rbp - 304], rcx
.loc 1 4 3
jle .LBB0_14
.loc 1 0 3
# [rbp-104] / [rbp-96] = source / destination pointers for the batch loop;
# rcx = batch counter, starting at zero.
mov rax, qword ptr [rbp - 88]
mov rdx, qword ptr [rbp - 80]
xor ecx, ecx
mov qword ptr [rbp - 104], rax
mov qword ptr [rbp - 96], rdx
jmp .LBB0_8
.p2align 4, 0x90
# ---- Batch loop latch: next of up to 64 consecutive outer elements -----------
.LBB0_13:
mov rcx, qword ptr [rbp - 344]
mov rax, qword ptr [rbp - 96]
mov rdx, qword ptr [rbp - 104]
.loc 1 4 3
# Destination += Q*204800, source += N0*12800.
add rax, qword ptr [rbp - 320]
add rdx, qword ptr [rbp - 312]
inc rcx
mov qword ptr [rbp - 96], rax
mov qword ptr [rbp - 104], rdx
cmp rcx, qword ptr [rbp - 328]
je .LBB0_14
# ---- Batch loop header -------------------------------------------------------
.LBB0_8:
.loc 1 0 3
# Skip the tile loop if no tiles remain; [rbp-344] = batch counter.
cmp qword ptr [rbp - 336], 0
mov qword ptr [rbp - 344], rcx
.loc 1 4 3
jle .LBB0_13
.loc 1 0 3
# r13 = source pointer, r12 = destination pointer, r8 = tile counter.
mov r13, qword ptr [rbp - 104]
mov r12, qword ptr [rbp - 96]
xor r8d, r8d
.p2align 4, 0x90
# ---- Tile loop: one 16-row tile per iteration --------------------------------
# r14 = rdi - 16*r8 = valid rows in this tile. For row i (0..15) a mask of all
# ones is built when r14 > i, else all zeros (setcc gives 0/1, neg turns 1 into
# -1). Rows 0..9 are spilled as 16-bit masks to the stack; rows 10..15 stay in
# k4, k5, k6, k7, k1, k2. The spill slots [rbp-120] and [rbp-112] previously
# held setup values that are no longer read.
.LBB0_10:
.loc 1 4 3
mov rax, r8
shl rax, 4
mov r14, rdi
# rdx = running store pointer for this tile.
mov rdx, r12
sub r14, rax
# Row 0 mask: r14 > 0.
xor eax, eax
test r14, r14
setg al
neg eax
kmovd k1, eax
# Row 1 mask: r14 >= 2 (the row-0 spill is interleaved by the scheduler).
xor eax, eax
cmp r14, 2
setge al
kmovw word ptr [rbp - 120], k1
neg eax
kmovd k1, eax
# Row 2 mask.
xor eax, eax
cmp r14, 3
setge al
kmovw word ptr [rbp - 112], k1
neg eax
kmovd k1, eax
# Row 3 mask.
xor eax, eax
cmp r14, 4
setge al
kmovw word ptr [rbp - 56], k1
neg eax
kmovd k1, eax
# Row 4 mask.
xor eax, eax
cmp r14, 5
setge al
kmovw word ptr [rbp - 54], k1
neg eax
kmovd k1, eax
# Row 5 mask.
xor eax, eax
cmp r14, 6
setge al
kmovw word ptr [rbp - 52], k1
neg eax
kmovd k1, eax
# Row 6 mask.
xor eax, eax
cmp r14, 7
setge al
kmovw word ptr [rbp - 50], k1
neg eax
kmovd k1, eax
# Rows 7..9 masks (spilled to [rbp-46], [rbp-44], [rbp-42]).
xor eax, eax
cmp r14, 8
setge al
xor esi, esi
kmovw word ptr [rbp - 48], k1
neg eax
cmp r14, 9
setge sil
xor ebx, ebx
kmovd k1, eax
neg esi
cmp r14, 10
kmovw word ptr [rbp - 46], k1
setge bl
xor r9d, r9d
kmovd k1, esi
neg ebx
cmp r14, 11
kmovw word ptr [rbp - 44], k1
setge r9b
xor r15d, r15d
kmovd k1, ebx
neg r9d
cmp r14, 12
kmovw word ptr [rbp - 42], k1
setge r15b
# Rows 10..15 masks are kept in registers: k4, k5, k6, k7, k1, k2.
xor r11d, r11d
kmovd k4, r9d
neg r15d
cmp r14, 13
setge r11b
xor eax, eax
kmovd k5, r15d
neg r11d
cmp r14, 14
setge al
xor r10d, r10d
kmovd k6, r11d
neg eax
cmp r14, 15
setge r10b
xor ecx, ecx
kmovd k7, eax
neg r10d
cmp r14, 16
# r14 becomes the column counter of the inner loop (mov keeps the flags that
# the following setge consumes).
mov r14, -16
setge cl
kmovd k1, r10d
neg ecx
kmovd k2, ecx
.p2align 4, 0x90
# ---- Inner loop: one 16x16 sub-block per pass, 8 passes per tile -------------
# r14 takes the values -16, 0, ..., 96; the displacement 4*r14 + 64 therefore
# covers byte offsets 0..448 of each row in 64-byte steps (128 elements).
# Row k is read at displacement 64 - (15-k)*12800, hence -191936 for row 0 and
# +64 for row 15. k3 is reloaded from the spill slots for rows 0..9.
.LBB0_11:
.loc 1 0 3
kmovw k3, word ptr [rbp - 120]
.loc 1 4 3
vmovups zmm10 {k4} {z}, zmmword ptr [r13 + 4*r14 - 63936]
vmovups zmm11 {k5} {z}, zmmword ptr [r13 + 4*r14 - 51136]
vmovups zmm12 {k6} {z}, zmmword ptr [r13 + 4*r14 - 38336]
vmovups zmm13 {k7} {z}, zmmword ptr [r13 + 4*r14 - 25536]
vmovups zmm14 {k1} {z}, zmmword ptr [r13 + 4*r14 - 12736]
vmovups zmm15 {k2} {z}, zmmword ptr [r13 + 4*r14 + 64]
vmovups zmm0 {k3} {z}, zmmword ptr [r13 + 4*r14 - 191936]
kmovw k3, word ptr [rbp - 112]
# Transpose stage 1 and 2: interleave 32-bit (unpck*ps) and then 64-bit
# (unpck*pd) elements of row pairs within each 128-bit lane. The loads of the
# remaining rows are interleaved with these by the scheduler.
vunpcklps zmm17, zmm10, zmm11
vunpckhps zmm10, zmm10, zmm11
vunpcklps zmm11, zmm12, zmm13
vunpckhps zmm12, zmm12, zmm13
vunpcklps zmm13, zmm14, zmm15
vunpckhps zmm14, zmm14, zmm15
vunpcklpd zmm20, zmm11, zmm13
vunpckhpd zmm11, zmm11, zmm13
vunpcklpd zmm13, zmm12, zmm14
vunpckhpd zmm12, zmm12, zmm14
vmovups zmm1 {k3} {z}, zmmword ptr [r13 + 4*r14 - 179136]
kmovw k3, word ptr [rbp - 56]
vmovups zmm2 {k3} {z}, zmmword ptr [r13 + 4*r14 - 166336]
kmovw k3, word ptr [rbp - 54]
vunpcklps zmm16, zmm0, zmm1
vunpckhps zmm0, zmm0, zmm1
vmovups zmm3 {k3} {z}, zmmword ptr [r13 + 4*r14 - 153536]
kmovw k3, word ptr [rbp - 52]
vmovups zmm4 {k3} {z}, zmmword ptr [r13 + 4*r14 - 140736]
kmovw k3, word ptr [rbp - 50]
vunpcklps zmm1, zmm2, zmm3
vunpckhps zmm2, zmm2, zmm3
vunpcklpd zmm15, zmm16, zmm1
vunpckhpd zmm1, zmm16, zmm1
vunpcklpd zmm16, zmm0, zmm2
vunpckhpd zmm0, zmm0, zmm2
vmovups zmm5 {k3} {z}, zmmword ptr [r13 + 4*r14 - 127936]
kmovw k3, word ptr [rbp - 48]
vmovups zmm6 {k3} {z}, zmmword ptr [r13 + 4*r14 - 115136]
kmovw k3, word ptr [rbp - 46]
vunpcklps zmm3, zmm4, zmm5
vunpckhps zmm4, zmm4, zmm5
vmovups zmm7 {k3} {z}, zmmword ptr [r13 + 4*r14 - 102336]
kmovw k3, word ptr [rbp - 44]
vmovups zmm8 {k3} {z}, zmmword ptr [r13 + 4*r14 - 89536]
kmovw k3, word ptr [rbp - 42]
vunpcklps zmm5, zmm6, zmm7
vunpckhps zmm6, zmm6, zmm7
vunpcklpd zmm2, zmm3, zmm5
vunpckhpd zmm3, zmm3, zmm5
vunpcklpd zmm5, zmm4, zmm6
vunpckhpd zmm4, zmm4, zmm6
vmovups zmm9 {k3} {z}, zmmword ptr [r13 + 4*r14 - 76736]
# Transpose stage 3: combine 128-bit lanes across row groups. Immediate 136
# selects lanes 0 and 2 of each source, immediate 221 selects lanes 1 and 3.
vshuff64x2 zmm14, zmm15, zmm2, 136
vshuff64x2 zmm19, zmm0, zmm4, 136
vshuff64x2 zmm18, zmm16, zmm5, 136
vshuff64x2 zmm2, zmm15, zmm2, 221
vshuff64x2 zmm0, zmm0, zmm4, 221
# Advance the column counter early; the flags are set again by the cmp before
# the loop branch.
add r14, 16
vunpcklps zmm7, zmm8, zmm9
vunpckhps zmm8, zmm8, zmm9
vunpcklpd zmm21, zmm8, zmm10
vunpcklpd zmm6, zmm7, zmm17
vunpckhpd zmm7, zmm7, zmm17
vunpckhpd zmm8, zmm8, zmm10
vshuff64x2 zmm17, zmm1, zmm3, 136
vshuff64x2 zmm1, zmm1, zmm3, 221
vshuff64x2 zmm3, zmm16, zmm5, 221
vshuff64x2 zmm4, zmm6, zmm20, 136
vshuff64x2 zmm6, zmm6, zmm20, 221
vshuff64x2 zmm5, zmm7, zmm11, 136
vshuff64x2 zmm16, zmm8, zmm12, 136
vshuff64x2 zmm7, zmm7, zmm11, 221
vshuff64x2 zmm9, zmm21, zmm13, 221
vshuff64x2 zmm15, zmm21, zmm13, 136
vshuff64x2 zmm8, zmm8, zmm12, 221
# Transpose stage 4: final lane shuffle, merging the rows 0..7 half with the
# rows 8..15 half to form the 16 output vectors.
vshuff64x2 zmm10, zmm14, zmm4, 136
vshuff64x2 zmm11, zmm17, zmm5, 136
vshuff64x2 zmm20, zmm2, zmm6, 136
vshuff64x2 zmm21, zmm1, zmm7, 136
vshuff64x2 zmm22, zmm3, zmm9, 136
vshuff64x2 zmm23, zmm0, zmm8, 136
vshuff64x2 zmm4, zmm14, zmm4, 221
vshuff64x2 zmm12, zmm18, zmm15, 136
vshuff64x2 zmm14, zmm18, zmm15, 221
vshuff64x2 zmm13, zmm19, zmm16, 136
vshuff64x2 zmm5, zmm17, zmm5, 221
vshuff64x2 zmm15, zmm19, zmm16, 221
vshuff64x2 zmm2, zmm2, zmm6, 221
vshuff64x2 zmm1, zmm1, zmm7, 221
vshuff64x2 zmm3, zmm3, zmm9, 221
vshuff64x2 zmm0, zmm0, zmm8, 221
# Store the transposed sub-block: 16 aligned 64-byte vectors at [rdx-960 .. rdx].
vmovapd zmmword ptr [rdx - 960], zmm10
vmovapd zmmword ptr [rdx - 896], zmm11
vmovapd zmmword ptr [rdx - 832], zmm12
vmovapd zmmword ptr [rdx - 768], zmm13
vmovapd zmmword ptr [rdx - 704], zmm20
vmovapd zmmword ptr [rdx - 640], zmm21
vmovapd zmmword ptr [rdx - 576], zmm22
vmovapd zmmword ptr [rdx - 512], zmm23
vmovapd zmmword ptr [rdx - 448], zmm4
vmovapd zmmword ptr [rdx - 384], zmm5
vmovapd zmmword ptr [rdx - 320], zmm14
vmovapd zmmword ptr [rdx - 256], zmm15
vmovapd zmmword ptr [rdx - 192], zmm2
vmovapd zmmword ptr [rdx - 128], zmm1
vmovapd zmmword ptr [rdx - 64], zmm3
vmovapd zmmword ptr [rdx], zmm0
# Next sub-block: 1024 bytes further in the destination. The compare is
# unsigned, so the loop ends when r14 reaches 112 (after the pass with 96).
add rdx, 1024
cmp r14, 112
jb .LBB0_11
# Next tile: both pointers advance 204800 bytes (source: 16 rows of 12800).
inc r8
add r12, 204800
add r13, 204800
cmp r8, qword ptr [rbp - 352]
jne .LBB0_10
jmp .LBB0_13
# ---- Exit: return 0 ----------------------------------------------------------
.LBB0_17:
xor eax, eax
.loc 1 4 3 epilogue_begin
add rsp, 184
pop rbx
pop r12
pop r13
pop r14
pop r15
pop rbp
.cfi_def_cfa rsp, 8
# Clear the upper vector state before returning to the caller.
vzeroupper
ret
.Ltmp1:
.Lfunc_end0:
.size turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_0_pack_f32, .Lfunc_end0-turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_0_pack_f32
.cfi_endproc
# Read-only constant pool of 16-byte entries (flags "aM": allocatable and
# mergeable, entity size 16), each entry aligned to 16 bytes.
# The .LCPI1_* labels in this section are not referenced by any code visible in
# this part of the file.
# NOTE(review): the values look like element-index vectors for shuffle/permute
# instructions of another function in this module -- confirm at the use sites.
.section .rodata.cst16,"aM",@progbits,16
.p2align 4, 0x0
# Entry of eight 16-bit values (16 bytes).
.LCPI1_0:
.short 0
.short 1
.short 2
.short 26
.short 2
.short 26
.short 3
.short 27
.LCPI1_1:
.short 0
.short 1
.short 2
.short 25
.short 2
.short 0
.short 0
.short 24
.LCPI1_6:
.short 0
.short 1
.short 2
.short 9
.short 2
.short 0
.short 0
.short 8
.LCPI1_15:
.short 0
.short 1
.short 2
.short 10
.short 2
.short 10
.short 3
.short 11
.LCPI1_95:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 16
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_96:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 12
.byte 16
.byte 0
.byte 0
.LCPI1_97:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 17
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_98:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 17
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_99:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 16
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_101:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 17
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_103:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 20
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_104:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 17
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_105:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 20
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_107:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 18
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_108:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 18
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_109:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 12
.byte 18
.byte 0
.byte 0
.LCPI1_111:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 12
.byte 20
.byte 0
.byte 0
.LCPI1_112:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 22
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_113:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 22
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_114:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 12
.byte 22
.byte 0
.byte 0
.LCPI1_115:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 24
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_116:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 24
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_117:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 12
.byte 24
.byte 0
.byte 0
.LCPI1_118:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 17
.byte 0
.byte 0
.byte 0
.LCPI1_119:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 26
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_120:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 26
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_121:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 12
.byte 26
.byte 0
.byte 0
.LCPI1_122:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 12
.byte 13
.byte 14
.byte 16
.LCPI1_123:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 19
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_124:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 21
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_125:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 23
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_126:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 29
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_127:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 31
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_128:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 27
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_129:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 12
.byte 28
.byte 0
.byte 0
.LCPI1_130:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 28
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_131:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 28
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_132:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 29
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_133:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 29
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_134:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 29
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_135:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 29
.byte 0
.byte 0
.byte 0
.LCPI1_136:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 30
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_137:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 30
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_138:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 19
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_139:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 21
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_140:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 23
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_141:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 31
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_142:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 19
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_143:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 21
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_144:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 23
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_145:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 25
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_146:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 31
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_147:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 19
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_148:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 31
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_149:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 21
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_150:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 23
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_151:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 25
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI1_152:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 19
.byte 0
.byte 0
.byte 0
.LCPI1_153:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 21
.byte 0
.byte 0
.byte 0
.LCPI1_154:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 23
.byte 0
.byte 0
.byte 0
.LCPI1_155:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 25
.byte 0
.byte 0
.byte 0
.LCPI1_156:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 27
.byte 0
.byte 0
.byte 0
.LCPI1_157:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 31
.byte 0
.byte 0
.byte 0
.LCPI1_158:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 12
.byte 17
.byte 0
.byte 0
.LCPI1_159:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 12
.byte 13
.byte 17
.byte 0
.LCPI1_160:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 12
.byte 19
.byte 0
.byte 0
.LCPI1_161:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 12
.byte 21
.byte 0
.byte 0
.LCPI1_162:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 12
.byte 23
.byte 0
.byte 0
.LCPI1_163:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 12
.byte 25
.byte 0
.byte 0
.LCPI1_164:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 12
.byte 27
.byte 0
.byte 0
.LCPI1_165:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 12
.byte 30
.byte 0
.byte 0
.LCPI1_166:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 12
.byte 31
.byte 0
.byte 0
.LCPI1_167:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 12
.byte 13
.byte 19
.byte 0
.LCPI1_168:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 12
.byte 13
.byte 31
.byte 0
.LCPI1_169:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 12
.byte 13
.byte 21
.byte 0
.LCPI1_170:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 12
.byte 13
.byte 23
.byte 0
.LCPI1_171:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 12
.byte 13
.byte 25
.byte 0
.LCPI1_172:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 12
.byte 13
.byte 27
.byte 0
.LCPI1_173:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 12
.byte 13
.byte 29
.byte 0
.LCPI1_174:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 12
.byte 13
.byte 14
.byte 17
.LCPI1_175:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 12
.byte 13
.byte 14
.byte 18
.LCPI1_176:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 12
.byte 13
.byte 14
.byte 19
.LCPI1_177:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 12
.byte 13
.byte 14
.byte 20
.LCPI1_178:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 12
.byte 13
.byte 14
.byte 21
.LCPI1_179:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 12
.byte 13
.byte 14
.byte 22
.LCPI1_180:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 12
.byte 13
.byte 14
.byte 23
.LCPI1_181:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 12
.byte 13
.byte 14
.byte 24
.LCPI1_182:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 12
.byte 13
.byte 14
.byte 25
.LCPI1_183:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 12
.byte 13
.byte 14
.byte 26
.LCPI1_184:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 12
.byte 13
.byte 14
.byte 27
.LCPI1_185:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 12
.byte 13
.byte 14
.byte 28
.LCPI1_186:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 12
.byte 13
.byte 14
.byte 29
.LCPI1_187:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
.byte 8
.byte 9
.byte 10
.byte 11
.byte 12
.byte 13
.byte 14
.byte 30
# Read-only constant pool of 32-byte entries (flags "aM": allocatable and
# mergeable, entity size 32), each entry aligned to 32 bytes.
# The .LCPI1_* labels in this section are not referenced by any code visible in
# this part of the file.
# NOTE(review): the values look like element-index vectors for shuffle/permute
# instructions of another function in this module -- confirm at the use sites.
.section .rodata.cst32,"aM",@progbits,32
.p2align 5, 0x0
# Entry of sixteen 16-bit slots (32 bytes): ten explicit values followed by six
# zero-filled slots emitted with ".zero 2".
.LCPI1_2:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 16
.zero 2
.zero 2
.zero 2
.zero 2
.zero 2
.zero 2
.LCPI1_3:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 9
.short 10
.short 16
.zero 2
.zero 2
.zero 2
.zero 2
.LCPI1_4:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 9
.short 10
.short 11
.short 12
.short 16
.zero 2
.zero 2
.LCPI1_5:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 9
.short 10
.short 11
.short 12
.short 13
.short 14
.short 16
.LCPI1_7:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 17
.zero 2
.zero 2
.zero 2
.zero 2
.zero 2
.zero 2
.zero 2
.LCPI1_8:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 17
.zero 2
.zero 2
.zero 2
.zero 2
.zero 2
.zero 2
.LCPI1_9:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 9
.short 17
.zero 2
.zero 2
.zero 2
.zero 2
.zero 2
.LCPI1_10:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 9
.short 10
.short 17
.zero 2
.zero 2
.zero 2
.zero 2
.LCPI1_11:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 9
.short 10
.short 11
.short 17
.zero 2
.zero 2
.zero 2
.LCPI1_12:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 9
.short 10
.short 11
.short 12
.short 17
.zero 2
.zero 2
.LCPI1_13:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 9
.short 10
.short 11
.short 12
.short 13
.short 17
.zero 2
.LCPI1_14:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 9
.short 10
.short 11
.short 12
.short 13
.short 14
.short 17
.LCPI1_16:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 18
.zero 2
.zero 2
.zero 2
.zero 2
.zero 2
.zero 2
.LCPI1_17:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 9
.short 10
.short 18
.zero 2
.zero 2
.zero 2
.zero 2
.LCPI1_18:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 9
.short 10
.short 11
.short 12
.short 18
.zero 2
.zero 2
.LCPI1_19:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 9
.short 10
.short 11
.short 12
.short 13
.short 14
.short 18
.LCPI1_20:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 19
.zero 2
.zero 2
.zero 2
.zero 2
.zero 2
.zero 2
.zero 2
.LCPI1_21:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 19
.zero 2
.zero 2
.zero 2
.zero 2
.zero 2
.zero 2
.LCPI1_22:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 9
.short 19
.zero 2
.zero 2
.zero 2
.zero 2
.zero 2
.LCPI1_23:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 9
.short 10
.short 19
.zero 2
.zero 2
.zero 2
.zero 2
.LCPI1_24:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 9
.short 10
.short 11
.short 19
.zero 2
.zero 2
.zero 2
.LCPI1_25:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 9
.short 10
.short 11
.short 12
.short 19
.zero 2
.zero 2
.LCPI1_26:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 9
.short 10
.short 11
.short 12
.short 13
.short 19
.zero 2
.LCPI1_27:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 9
.short 10
.short 11
.short 12
.short 13
.short 14
.short 19
.LCPI1_28:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 20
.zero 2
.zero 2
.zero 2
.zero 2
.zero 2
.zero 2
.LCPI1_29:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 9
.short 10
.short 20
.zero 2
.zero 2
.zero 2
.zero 2
.LCPI1_30:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 9
.short 10
.short 11
.short 12
.short 20
.zero 2
.zero 2
.LCPI1_31:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 9
.short 10
.short 11
.short 12
.short 13
.short 14
.short 20
.LCPI1_32:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 21
.zero 2
.zero 2
.zero 2
.zero 2
.zero 2
.zero 2
.zero 2
.LCPI1_33:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 21
.zero 2
.zero 2
.zero 2
.zero 2
.zero 2
.zero 2
.LCPI1_34:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 9
.short 21
.zero 2
.zero 2
.zero 2
.zero 2
.zero 2
.LCPI1_35:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 9
.short 10
.short 21
.zero 2
.zero 2
.zero 2
.zero 2
.LCPI1_36:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 9
.short 10
.short 11
.short 21
.zero 2
.zero 2
.zero 2
.LCPI1_37:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 9
.short 10
.short 11
.short 12
.short 21
.zero 2
.zero 2
.LCPI1_38:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 9
.short 10
.short 11
.short 12
.short 13
.short 21
.zero 2
.LCPI1_39:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 9
.short 10
.short 11
.short 12
.short 13
.short 14
.short 21
.LCPI1_40:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 22
.zero 2
.zero 2
.zero 2
.zero 2
.zero 2
.zero 2
.LCPI1_41:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 9
.short 10
.short 22
.zero 2
.zero 2
.zero 2
.zero 2
.LCPI1_42:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 9
.short 10
.short 11
.short 12
.short 22
.zero 2
.zero 2
.LCPI1_43:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 9
.short 10
.short 11
.short 12
.short 13
.short 14
.short 22
.LCPI1_44:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 23
.zero 2
.zero 2
.zero 2
.zero 2
.zero 2
.zero 2
.zero 2
.LCPI1_45:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 23
.zero 2
.zero 2
.zero 2
.zero 2
.zero 2
.zero 2
.LCPI1_46:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 9
.short 23
.zero 2
.zero 2
.zero 2
.zero 2
.zero 2
.LCPI1_47:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 9
.short 10
.short 23
.zero 2
.zero 2
.zero 2
.zero 2
.LCPI1_48:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 9
.short 10
.short 11
.short 23
.zero 2
.zero 2
.zero 2
.LCPI1_49:
.short 0
.short 1
.short 2
.short 3
.short 4
.short 5
.short 6
.short 7
.short 8
.short 9
.short 10
.short 11
.short 12
.short 23
.zero 2
.zero 2
.LCPI1_50:
# Word table, 32 bytes: entries 0-13 hold their own position, entry 14 holds 23, entry 15 is zero-filled.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 23
.zero 2
.LCPI1_51:
# Word table, 32 bytes: entries 0-14 hold their own position, entry 15 holds 23.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 23
.LCPI1_52:
# Word table, 32 bytes: entries 0-8 hold their own position, entry 9 holds 24, the last six entries are zero-filled.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 24
.zero 12
.LCPI1_53:
# Word table, 32 bytes: entries 0-10 hold their own position, entry 11 holds 24, the last four entries are zero-filled.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 24
.zero 8
.LCPI1_54:
# Word table, 32 bytes: entries 0-12 hold their own position, entry 13 holds 24, the last two entries are zero-filled.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 24
.zero 4
.LCPI1_55:
# Word table, 32 bytes: entries 0-14 hold their own position, entry 15 holds 24.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 24
.LCPI1_56:
# Word table, 32 bytes: entries 0-9 hold their own position, entry 10 holds 25, the last five entries are zero-filled.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 25
.zero 10
.LCPI1_57:
# Word table, 32 bytes: entries 0-10 hold their own position, entry 11 holds 25, the last four entries are zero-filled.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 25
.zero 8
.LCPI1_58:
# Word table, 32 bytes: entries 0-11 hold their own position, entry 12 holds 25, the last three entries are zero-filled.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 25
.zero 6
.LCPI1_59:
# Word table, 32 bytes: entries 0-12 hold their own position, entry 13 holds 25, the last two entries are zero-filled.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 25
.zero 4
.LCPI1_60:
# Word table, 32 bytes: entries 0-13 hold their own position, entry 14 holds 25, entry 15 is zero-filled.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 25
.zero 2
.LCPI1_61:
# Word table, 32 bytes: entries 0-14 hold their own position, entry 15 holds 25.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 25
.LCPI1_62:
# Word table, 32 bytes: entries 0-8 hold their own position, entry 9 holds 26, the last six entries are zero-filled.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 26
.zero 12
.LCPI1_63:
# Word table, 32 bytes: entries 0-10 hold their own position, entry 11 holds 26, the last four entries are zero-filled.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 26
.zero 8
.LCPI1_64:
# Word table, 32 bytes: entries 0-12 hold their own position, entry 13 holds 26, the last two entries are zero-filled.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 26
.zero 4
.LCPI1_65:
# Word table, 32 bytes: entries 0-14 hold their own position, entry 15 holds 26.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 26
.LCPI1_67:
# Word table, 32 bytes: entries 0-9 hold their own position, entry 10 holds 27, the last five entries are zero-filled.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 27
.zero 10
.LCPI1_68:
# Word table, 32 bytes: entries 0-11 hold their own position, entry 12 holds 27, the last three entries are zero-filled.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 27
.zero 6
.LCPI1_69:
# Word table, 32 bytes: entries 0-12 hold their own position, entry 13 holds 27, the last two entries are zero-filled.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 27
.zero 4
.LCPI1_70:
# Word table, 32 bytes: entries 0-13 hold their own position, entry 14 holds 27, entry 15 is zero-filled.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 27
.zero 2
.LCPI1_71:
# Word table, 32 bytes: entries 0-14 hold their own position, entry 15 holds 27.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 27
.LCPI1_72:
# Word table, 32 bytes: entries 0-8 hold their own position, entry 9 holds 28, the last six entries are zero-filled.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 28
.zero 12
.LCPI1_73:
# Word table, 32 bytes: entries 0-10 hold their own position, entry 11 holds 28, the last four entries are zero-filled.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 28
.zero 8
.LCPI1_74:
# Word table, 32 bytes: entries 0-12 hold their own position, entry 13 holds 28, the last two entries are zero-filled.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 28
.zero 4
.LCPI1_75:
# Word table, 32 bytes: entries 0-14 hold their own position, entry 15 holds 28.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 28
.LCPI1_76:
# Word table, 32 bytes: entries 0-7 hold their own position, entry 8 holds 29, the last seven entries are zero-filled.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 29
.zero 14
.LCPI1_77:
# Word table, 32 bytes: entries 0-8 hold their own position, entry 9 holds 29, the last six entries are zero-filled.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 29
.zero 12
.LCPI1_78:
# Word table, 32 bytes: entries 0-9 hold their own position, entry 10 holds 29, the last five entries are zero-filled.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 29
.zero 10
.LCPI1_79:
# Word table, 32 bytes: entries 0-10 hold their own position, entry 11 holds 29, the last four entries are zero-filled.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 29
.zero 8
.LCPI1_80:
# Word table, 32 bytes: entries 0-11 hold their own position, entry 12 holds 29, the last three entries are zero-filled.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 29
.zero 6
.LCPI1_81:
# Word table, 32 bytes: entries 0-13 hold their own position, entry 14 holds 29, entry 15 is zero-filled.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 29
.zero 2
.LCPI1_82:
# Word table, 32 bytes: entries 0-14 hold their own position, entry 15 holds 29.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 29
.LCPI1_83:
# Word table, 32 bytes: entries 0-8 hold their own position, entry 9 holds 30, the last six entries are zero-filled.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 30
.zero 12
.LCPI1_84:
# Word table, 32 bytes: entries 0-10 hold their own position, entry 11 holds 30, the last four entries are zero-filled.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 30
.zero 8
.LCPI1_85:
# Word table, 32 bytes: entries 0-12 hold their own position, entry 13 holds 30, the last two entries are zero-filled.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 30
.zero 4
.LCPI1_86:
# Word table, 32 bytes: entries 0-14 hold their own position, entry 15 holds 30.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 30
.LCPI1_87:
# Word table, 32 bytes: entries 0-7 hold their own position, entry 8 holds 31, the last seven entries are zero-filled.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 31
.zero 14
.LCPI1_88:
# Word table, 32 bytes: entries 0-8 hold their own position, entry 9 holds 31, the last six entries are zero-filled.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 31
.zero 12
.LCPI1_89:
# Word table, 32 bytes: entries 0-9 hold their own position, entry 10 holds 31, the last five entries are zero-filled.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 31
.zero 10
.LCPI1_90:
# Word table, 32 bytes: entries 0-10 hold their own position, entry 11 holds 31, the last four entries are zero-filled.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 31
.zero 8
.LCPI1_91:
# Word table, 32 bytes: entries 0-11 hold their own position, entry 12 holds 31, the last three entries are zero-filled.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 31
.zero 6
.LCPI1_92:
# Word table, 32 bytes: entries 0-12 hold their own position, entry 13 holds 31, the last two entries are zero-filled.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 31
.zero 4
.LCPI1_93:
# Word table, 32 bytes: entries 0-13 hold their own position, entry 14 holds 31, entry 15 is zero-filled.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 31
.zero 2
.LCPI1_94:
# Word table, 32 bytes: only entries 8 and 9 are populated (11 and 27); the eight entries before and six after are zero-filled.
# Presumably a word-shuffle index vector; the consuming instruction is not visible here (confirm).
.zero 16
.short 11, 27
.zero 12
.section .rodata.cst4,"aM",@progbits,4
.p2align 2, 0x0
.LCPI1_66:
# One dword made of the word pair (11, 27). The pack kernel replicates it across
# a ymm register with vpbroadcastd and uses that register as the index operand
# of vpermt2w.
.short 11, 27
.section .rodata.cst8,"aM",@progbits,8
# Align this pool to its 8-byte entry size, the same way the .rodata.cst4 pool
# is aligned to 4. Every entry here is read with a single qword load, so a
# naturally aligned entry can never straddle a cache line.
.p2align 3, 0x0
.LCPI1_100:
# Eight byte-sized indices. The pack kernel sign-extends them to words with
# vpmovsxbw (qword load) and passes the result to vpermt2w as the index operand.
.byte 0
.byte 1
.byte 2
.byte 26
.byte 2
.byte 26
.byte 3
.byte 27
.LCPI1_102:
# Eight byte-sized indices. The pack kernel widens them to words with vpmovsxbw
# (qword load) and feeds them to the xmm form of vpermt2w as the index operand.
.byte 0, 1, 2, 9, 2, 0, 0, 8
.LCPI1_106:
# Eight byte-sized indices. The pack kernel widens them to words with vpmovsxbw
# (qword load) and feeds them to the xmm form of vpermt2w as the index operand.
.byte 0, 1, 2, 10, 2, 10, 3, 11
.LCPI1_110:
# Eight byte-sized indices. The pack kernel widens them to words with vpmovsxbw
# (qword load into xmm8) and uses ymm8 as the index operand of vpermt2w.
.byte 0, 1, 2, 25, 2, 0, 0, 24
.section .text.turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack,"ax",@progbits
.p2align 4, 0x90
.type turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack,@function
turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack:
.Lfunc_begin1:
.loc 1 1 0 is_stmt 1
.cfi_startproc
push rbp
.cfi_def_cfa_offset 16
.cfi_offset rbp, -16
mov rbp, rsp
.cfi_def_cfa_register rbp
.Ltmp2:
push r15
push r14
push r13
push r12
push rbx
and rsp, -32
sub rsp, 1792
.cfi_offset rbx, -56
.cfi_offset r12, -48
.cfi_offset r13, -40
.cfi_offset r14, -32
.cfi_offset r15, -24
.loc 1 4 3 prologue_end
mov r8, qword ptr [rsi + 24]
movzx edi, word ptr [rdx + 8]
mov r14d, dword ptr [r8 + 12]
mov ecx, dword ptr [r8 + 8]
mov r9d, edi
shl r9d, 6
mov qword ptr [rsp + 16], r9
shl r14, 32
lea rax, [r14 + rcx]
mov qword ptr [rsp + 32], rax
cmp r9, rax
jge .LBB1_14
.loc 1 0 3 is_stmt 0
mov rbx, qword ptr [rsi + 32]
mov r9d, 15361
mov r11d, dword ptr [rsi + 12]
mov eax, dword ptr [rsi + 16]
movzx r10d, word ptr [rsi + 20]
mov r15d, 3538944000
.loc 1 4 3
or r14, rcx
vpmovsxbw ymm28, xmmword ptr [rip + .LCPI1_95]
vpmovsxbw ymm11, xmmword ptr [rip + .LCPI1_96]
vpmovsxbw ymm26, xmmword ptr [rip + .LCPI1_97]
vpmovsxbw ymm22, xmmword ptr [rip + .LCPI1_98]
sub r14, qword ptr [rsp + 16]
bextr r9, qword ptr [r8], r9
imul rdi, r15
mov rsi, qword ptr [rbx]
mov r8, qword ptr [rbx + 8]
mov ebx, dword ptr [rdx]
mov edx, dword ptr [rdx + 4]
imul r15, r10
shl r10d, 6
mov r12, r11
shl r12, 6
mov qword ptr [rsp + 64], r10
mov qword ptr [rsp + 72], r15
imul rcx, rdx, 409600
lea r10, [4*rdx]
mov qword ptr [rsp + 80], rdx
mov rdx, rbx
shl rdx, 11
mov qword ptr [rsp + 120], rbx
mov qword ptr [rsp + 56], r10
add rdi, rcx
add rdx, rdi
lea rdi, [4*rax]
imul rax, rax, 409600
lea rdx, [rdx + 2*r9]
mov qword ptr [rsp + 112], rdi
mov rdi, rbx
shl rdi, 6
lea rdx, [r8 + rdx + 307680]
mov qword ptr [rsp + 104], rax
mov r8, r11
shl r8, 11
shl r11, 7
mov qword ptr [rsp + 96], rdi
mov qword ptr [rsp + 24], rdx
mov rdx, rbx
shl rdx, 7
add rdx, rcx
lea rax, [rsi + rdx + 403200]
mov qword ptr [rsp + 48], rax
jmp .LBB1_2
.p2align 4, 0x90
.LBB1_13:
.loc 1 0 3
mov rdx, qword ptr [rsp + 24]
mov rax, qword ptr [rsp + 16]
mov rcx, qword ptr [rsp + 64]
mov r14, qword ptr [rsp + 88]
.loc 1 4 3
add rdx, qword ptr [rsp + 72]
add rax, rcx
sub r14, rcx
mov qword ptr [rsp + 24], rdx
mov qword ptr [rsp + 16], rax
cmp rax, qword ptr [rsp + 32]
jge .LBB1_14
.LBB1_2:
cmp r14, 64
mov edx, 64
mov eax, 1
mov qword ptr [rsp + 88], r14
cmovl rdx, r14
cmp rdx, 2
cmovl rdx, rax
cmp dword ptr [rsp + 80], 134
ja .LBB1_13
.loc 1 0 3
mov rcx, qword ptr [rsp + 32]
mov rax, qword ptr [rsp + 48]
mov rdi, qword ptr [rsp + 24]
mov r10, qword ptr [rsp + 56]
.loc 1 4 3
sub rcx, qword ptr [rsp + 16]
mov qword ptr [rsp + 40], rax
jmp .LBB1_4
.p2align 4, 0x90
.LBB1_12:
.loc 1 0 3
mov rax, qword ptr [rsp + 104]
.loc 1 4 3
add r10, qword ptr [rsp + 112]
add qword ptr [rsp + 40], rax
add rdi, rax
cmp r10, 540
jge .LBB1_13
.LBB1_4:
.loc 1 0 3
cmp dword ptr [rsp + 120], 49
.loc 1 4 3
ja .LBB1_12
.loc 1 0 3
mov r9, qword ptr [rsp + 40]
mov rax, qword ptr [rsp + 96]
mov rbx, rdi
jmp .LBB1_6
.p2align 4, 0x90
.LBB1_11:
.loc 1 4 3
add rax, r12
add rbx, r8
add r9, r11
cmp rax, 3200
jge .LBB1_12
.LBB1_6:
.loc 1 0 3
test rcx, rcx
.loc 1 4 3
jle .LBB1_11
.loc 1 0 3
mov r14, rbx
xor esi, esi
.p2align 4, 0x90
.LBB1_8:
mov r13, -16
mov r15, r14
.p2align 4, 0x90
.LBB1_9:
.loc 1 4 3
vpbroadcastw ymm0, word ptr [r9 + 2*r13 - 403148]
vpbroadcastw xmm1, word ptr [r9 + 2*r13 - 396748]
vmovdqa64 xmm23, xmmword ptr [r9 + 2*r13 - 403152]
vmovdqa64 xmm16, xmmword ptr [r9 + 2*r13 - 390352]
vpbroadcastw xmm2, word ptr [r9 + 2*r13 - 191948]
vmovdqa xmm13, xmmword ptr [r9 + 2*r13 - 403168]
vmovdqa xmm12, xmmword ptr [r9 + 2*r13 - 396768]
vmovdqa xmm5, xmmword ptr [r9 + 2*r13 - 390368]
vmovdqa xmm6, xmmword ptr [r9 + 2*r13 - 383968]
vmovdqa xmm14, xmmword ptr [r9 + 2*r13 - 377568]
vpmovsxbw ymm27, xmmword ptr [rip + .LCPI1_99]
vmovdqa ymm10, ymmword ptr [r9 + 2*r13 - 345568]
vmovdqa64 ymm7, ymm22
vmovdqa64 xmm22, xmmword ptr [r9 + 2*r13 - 326368]
vmovdqa ymm15, ymmword ptr [r9 + 2*r13 - 319968]
vpmovsxbw xmm25, qword ptr [rip + .LCPI1_100]
vmovdqa64 ymm17, ymm26
vpmovsxbw ymm29, xmmword ptr [rip + .LCPI1_101]
vpunpcklwd xmm0, xmm0, xmm1
vpbroadcastd ymm1, dword ptr [r9 + 2*r13 - 390348]
vmovdqa64 ymm21, ymm27
vpblendd xmm0, xmm0, xmm1, 2
vmovdqa xmm1, xmmword ptr [r9 + 2*r13 - 396752]
vmovdqa ymmword ptr [rsp + 320], ymm0
vpsrld xmm0, xmm23, 16
vpblendw xmm0, xmm1, xmm0, 1
vmovdqa64 xmm31, xmm1
vpsrld xmm1, xmm16, 16
vpunpckldq xmm24, xmm0, xmm1
vpbroadcastw ymm0, word ptr [r9 + 2*r13 - 300748]
vpbroadcastw xmm1, word ptr [r9 + 2*r13 - 294348]
vpunpcklwd xmm0, xmm0, xmm1
vpbroadcastd ymm1, dword ptr [r9 + 2*r13 - 287948]
vpblendd xmm0, xmm0, xmm1, 2
vmovdqa xmm1, xmmword ptr [r9 + 2*r13 - 294352]
vmovdqa ymmword ptr [rsp + 288], ymm0
vmovdqa xmm0, xmmword ptr [r9 + 2*r13 - 300752]
vmovdqa xmmword ptr [rsp + 192], xmm1
vmovdqa xmmword ptr [rsp + 224], xmm0
vpsrld xmm0, xmm0, 16
vpblendw xmm0, xmm1, xmm0, 1
vmovdqa xmm1, xmmword ptr [r9 + 2*r13 - 287952]
vmovdqa xmmword ptr [rsp + 160], xmm1
vpsrld xmm1, xmm1, 16
vpunpckldq xmm0, xmm0, xmm1
vpbroadcastd ymm1, dword ptr [r9 + 2*r13 - 185548]
vmovdqa ymmword ptr [rsp + 256], ymm0
vpbroadcastw ymm0, word ptr [r9 + 2*r13 - 198348]
vpunpcklwd xmm0, xmm0, xmm2
vpunpcklwd xmm2, xmm13, xmm12
vpblendd xmm0, xmm0, xmm1, 2
vmovdqa xmm1, xmmword ptr [r9 + 2*r13 - 191952]
vpunpckldq xmm4, xmm2, xmm5
vmovaps xmm2, xmmword ptr [r9 + 2*r13 - 364768]
vmovdqa ymmword ptr [rsp + 128], ymm0
vmovdqa xmm0, xmmword ptr [r9 + 2*r13 - 198352]
insertq xmm4, xmm6, 16, 48
vmovdqa xmmword ptr [rsp + 384], xmm1
vmovdqa xmmword ptr [rsp + 448], xmm0
vpsrld xmm0, xmm0, 16
vpblendw xmm0, xmm1, xmm0, 1
vmovdqa xmm1, xmmword ptr [r9 + 2*r13 - 185552]
vmovdqa xmmword ptr [rsp + 352], xmm1
vpsrld xmm1, xmm1, 16
vpunpckldq xmm0, xmm0, xmm1
vpbroadcastw xmm1, word ptr [r9 + 2*r13 - 89548]
vmovdqa ymmword ptr [rsp + 416], ymm0
vpbroadcastw ymm0, word ptr [r9 + 2*r13 - 95948]
vpunpcklwd xmm0, xmm0, xmm1
vpbroadcastd ymm1, dword ptr [r9 + 2*r13 - 83148]
vpblendd xmm0, xmm0, xmm1, 2
vmovdqa xmm1, xmmword ptr [r9 + 2*r13 - 89552]
vmovdqa ymmword ptr [rsp + 1728], ymm0
vmovdqa xmm0, xmmword ptr [r9 + 2*r13 - 95952]
vmovdqa xmmword ptr [rsp + 768], xmm1
vmovdqa xmmword ptr [rsp + 784], xmm0
vpsrld xmm0, xmm0, 16
vpblendw xmm0, xmm1, xmm0, 1
vmovdqa xmm1, xmmword ptr [r9 + 2*r13 - 83152]
vpsrld xmm3, xmm1, 16
vmovdqa xmmword ptr [rsp + 752], xmm1
vpmovsxbw xmm1, qword ptr [rip + .LCPI1_102]
vpunpckldq xmm0, xmm0, xmm3
vmovdqa xmm3, xmmword ptr [r9 + 2*r13 - 371168]
vmovdqa ymmword ptr [rsp + 1696], ymm0
vpunpcklqdq xmm0, xmm4, xmm14
vpbroadcastw xmm4, xmm3
vpblendw xmm4, xmm0, xmm4, 32
vpbroadcastd ymm0, dword ptr [r9 + 2*r13 - 339168]
vinsertps xmm8, xmm4, xmm2, 48
vmovdqa xmm4, xmmword ptr [r9 + 2*r13 - 358368]
vpbroadcastw xmm9, xmm4
vpblendw xmm8, xmm8, xmm9, 128
vinserti128 ymm9, ymm8, xmmword ptr [r9 + 2*r13 - 351968], 1
vinserti32x4 ymm8, ymm8, xmm22, 1
vpermt2w ymm9, ymm27, ymm10
vmovdqa64 ymm27, ymm7
vpmovsxbw ymm7, xmmword ptr [rip + .LCPI1_103]
vpblendd ymm0, ymm9, ymm0, 32
vmovdqa ymm9, ymmword ptr [r9 + 2*r13 - 332768]
vpermt2w ymm0, ymm28, ymm9
vpmovsxbw ymm28, xmmword ptr [rip + .LCPI1_104]
vshufpd ymm0, ymm0, ymm8, 2
vpmovsxbw ymm8, xmmword ptr [rip + .LCPI1_105]
vpermt2w ymm0, ymm11, ymm15
vpbroadcastd ymm11, dword ptr [r9 + 2*r13 - 313568]
vpblendd ymm0, ymm0, ymm11, 128
vpsrld xmm11, xmm5, 16
vmovdqa ymmword ptr [rsp + 704], ymm0
vpsrld xmm0, xmm13, 16
vpblendw xmm0, xmm0, xmm12, 2
vpunpckldq xmm0, xmm0, xmm11
vpsrld xmm11, xmm14, 16
vpermt2w xmm0, xmm1, xmm6
vpmovsxbw xmm1, qword ptr [rip + .LCPI1_106]
vpunpcklqdq xmm0, xmm0, xmm11
vpbroadcastw xmm11, word ptr [r9 + 2*r13 - 371166]
vpblendw xmm0, xmm0, xmm11, 32
vpslldq xmm11, xmm2, 10
vpblendd xmm0, xmm0, xmm11, 8
vpbroadcastw xmm11, word ptr [r9 + 2*r13 - 358366]
vpblendw xmm0, xmm0, xmm11, 128
vpbroadcastw xmm11, word ptr [r9 + 2*r13 - 403164]
vmovdqa64 ymm30, ymm0
vpbroadcastw xmm0, word ptr [r9 + 2*r13 - 396764]
vpunpcklwd xmm0, xmm11, xmm0
vpslldq xmm11, xmm3, 6
vpblendd xmm0, xmm0, xmm5, 2
vpermt2w xmm0, xmm1, xmm6
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_107]
vshufps xmm0, xmm0, xmm14, 212
vpblendw xmm0, xmm0, xmm11, 32
vpslldq xmm11, xmm4, 10
vinsertps xmm0, xmm0, xmm2, 112
vpblendw xmm0, xmm0, xmm11, 128
vpbroadcastd ymm11, dword ptr [r9 + 2*r13 - 351964]
vpblendd ymm11, ymm0, ymm11, 240
vpermt2w ymm11, ymm1, ymm10
vinserti128 ymm1, ymm0, xmmword ptr [r9 + 2*r13 - 339168], 1
vpbroadcastd ymm0, dword ptr [r9 + 2*r13 - 326364]
vpblendd ymm1, ymm11, ymm1, 34
vpmovsxbw ymm11, xmmword ptr [rip + .LCPI1_108]
vpermt2w ymm1, ymm11, ymm9
vpmovsxbw ymm11, xmmword ptr [rip + .LCPI1_109]
vpblendd ymm0, ymm1, ymm0, 192
vpbroadcastd ymm1, dword ptr [r9 + 2*r13 - 313564]
vpermt2w ymm0, ymm11, ymm15
vpbroadcastq ymm11, qword ptr [r9 + 2*r13 - 351960]
vpblendd ymm0, ymm0, ymm1, 128
vpsrlq xmm1, xmm13, 48
vmovdqa ymmword ptr [rsp + 672], ymm0
vpsrlq xmm0, xmm12, 48
vpunpcklwd xmm0, xmm1, xmm0
vpsrlq xmm1, xmm5, 48
vpunpckldq xmm0, xmm0, xmm1
vpsrlq xmm1, xmm14, 48
vpblendw xmm0, xmm0, xmm6, 8
vpunpcklqdq xmm0, xmm0, xmm1
vpbroadcastw xmm1, word ptr [r9 + 2*r13 - 371162]
vpblendw xmm0, xmm0, xmm1, 32
vpmovzxwd xmm1, xmm2
vpblendd xmm0, xmm0, xmm1, 8
vpbroadcastw xmm1, word ptr [r9 + 2*r13 - 358362]
vpblendw xmm0, xmm0, xmm1, 128
vpbroadcastw xmm1, word ptr [r9 + 2*r13 - 403160]
vmovdqa64 ymm20, ymm0
vpbroadcastw xmm0, word ptr [r9 + 2*r13 - 396760]
vpunpcklwd xmm0, xmm1, xmm0
vpsrldq xmm1, xmm6, 2
vinsertps xmm0, xmm0, xmm5, 156
vpblendw xmm0, xmm0, xmm1, 8
vpslld xmm1, xmm3, 16
vpblendd xmm0, xmm14, xmm0, 3
vpblendw xmm0, xmm0, xmm1, 32
vpsllq xmm1, xmm4, 48
vinsertps xmm0, xmm0, xmm2, 176
vpblendw xmm0, xmm0, xmm1, 128
vpblendd ymm1, ymm0, ymm11, 240
vpbroadcastd ymm11, dword ptr [r9 + 2*r13 - 339160]
vinserti32x4 ymm0, ymm0, xmm22, 1
vpermt2w ymm1, ymm8, ymm10
vpmovsxbw xmm8, qword ptr [rip + .LCPI1_110]
vpblendd ymm1, ymm1, ymm11, 32
vpbroadcastd ymm11, dword ptr [r9 + 2*r13 - 313560]
vmovdqa64 ymm26, ymm8
vpermt2w ymm1, ymm7, ymm9
vmovdqa xmm7, xmmword ptr [r9 + 2*r13 - 371152]
vpblendd ymm0, ymm1, ymm0, 204
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_111]
vpermt2w ymm0, ymm1, ymm15
vpsrldq xmm1, xmm13, 10
vpblendd ymm0, ymm0, ymm11, 128
vmovdqa64 ymm22, ymm0
vpsrldq xmm0, xmm12, 10
vpunpcklwd xmm0, xmm1, xmm0
vpsrldq xmm1, xmm5, 10
vpunpckldq xmm0, xmm0, xmm1
vpbroadcastw xmm1, word ptr [r9 + 2*r13 - 383958]
vpblendw xmm0, xmm0, xmm1, 8
vpsrldq xmm1, xmm14, 10
vpunpcklqdq xmm0, xmm0, xmm1
vpsllq xmm1, xmm2, 16
vpblendw xmm0, xmm0, xmm3, 32
vpblendd xmm0, xmm0, xmm1, 8
vpbroadcastw xmm1, word ptr [r9 + 2*r13 - 358358]
vpblendw xmm0, xmm0, xmm1, 128
vpbroadcastw xmm1, word ptr [r9 + 2*r13 - 403156]
vmovdqa64 ymm18, ymm0
vpbroadcastw xmm0, word ptr [r9 + 2*r13 - 396756]
vpunpcklwd xmm0, xmm1, xmm0
vpsrldq xmm1, xmm6, 6
vpmovsxbw ymm6, xmmword ptr [rip + .LCPI1_112]
vinsertps xmm0, xmm0, xmm5, 220
vpblendw xmm0, xmm0, xmm1, 8
vpsrlq xmm1, xmm3, 16
vpslld xmm3, xmm4, 16
vshufps xmm0, xmm0, xmm14, 244
vpblendw xmm0, xmm0, xmm1, 32
vpbroadcastd ymm1, dword ptr [r9 + 2*r13 - 351956]
vpblendd xmm0, xmm0, xmm2, 8
vpblendw xmm0, xmm0, xmm3, 128
vpbroadcastd ymm3, dword ptr [r9 + 2*r13 - 339156]
vpblendd ymm1, ymm0, ymm1, 240
vinserti128 ymm0, ymm0, xmmword ptr [r9 + 2*r13 - 313568], 1
vpermt2w ymm1, ymm6, ymm10
vpsrldq xmm6, xmm31, 10
vpblendd ymm1, ymm1, ymm3, 32
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_113]
vpermt2w ymm1, ymm3, ymm9
vpbroadcastd ymm3, dword ptr [r9 + 2*r13 - 326356]
vpblendd ymm1, ymm1, ymm3, 192
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_114]
vpermt2w ymm1, ymm3, ymm15
vpsrldq xmm3, xmm13, 14
vpblendd ymm0, ymm1, ymm0, 136
vpsrldq xmm1, xmm5, 14
vmovdqa ymmword ptr [rsp + 640], ymm0
vpsrldq xmm0, xmm12, 14
vmovdqa xmm12, xmmword ptr [r9 + 2*r13 - 383952]
vpunpcklwd xmm0, xmm3, xmm0
vpbroadcastd ymm3, dword ptr [r9 + 2*r13 - 339152]
vpunpckldq xmm0, xmm0, xmm1
vpbroadcastw xmm1, word ptr [r9 + 2*r13 - 383954]
vpblendw xmm0, xmm0, xmm1, 8
vpsrldq xmm1, xmm14, 14
vmovdqa xmm14, xmmword ptr [r9 + 2*r13 - 377552]
vpunpcklqdq xmm0, xmm0, xmm1
vpbroadcastw xmm1, word ptr [r9 + 2*r13 - 371154]
vpblendw xmm0, xmm0, xmm1, 32
vpsrld xmm1, xmm2, 16
vpblendd xmm0, xmm0, xmm1, 8
vpbroadcastw xmm1, xmm7
vpblendw xmm13, xmm0, xmm4, 128
vpbroadcastw ymm0, word ptr [r9 + 2*r13 - 403152]
vpmovsxbw ymm4, xmmword ptr [rip + .LCPI1_115]
vpunpcklwd xmm0, xmm0, xmm31
vpunpckldq xmm0, xmm0, xmm16
insertq xmm0, xmm12, 16, 48
vpunpcklqdq xmm0, xmm0, xmm14
vpblendw xmm0, xmm0, xmm1, 32
vpbroadcastd xmm1, dword ptr [r9 + 2*r13 - 364752]
vpblendd xmm1, xmm0, xmm1, 8
vmovdqa xmm0, xmmword ptr [r9 + 2*r13 - 358352]
vpbroadcastw xmm2, xmm0
vpblendw xmm1, xmm1, xmm2, 128
vmovdqa ymm2, ymmword ptr [r9 + 2*r13 - 351968]
vpblendd ymm1, ymm1, ymm2, 240
vpermt2w ymm30, ymm17, ymm2
vmovdqa64 ymm17, ymm18
vmovdqa64 ymm18, ymm13
vpermt2w ymm1, ymm4, ymm10
vpmovsxbw ymm4, xmmword ptr [rip + .LCPI1_116]
vpermt2w ymm30, ymm27, ymm10
vpblendd ymm1, ymm1, ymm3, 32
vpbroadcastq ymm3, qword ptr [r9 + 2*r13 - 326352]
vpermt2w ymm1, ymm4, ymm9
vpbroadcastw xmm4, word ptr [r9 + 2*r13 - 371150]
vpblendd ymm1, ymm1, ymm3, 192
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_117]
vpermt2w ymm1, ymm3, ymm15
vpbroadcastd ymm3, dword ptr [r9 + 2*r13 - 313552]
vpblendd ymm1, ymm1, ymm3, 128
vpsrld xmm3, xmm14, 16
vmovdqa ymmword ptr [rsp + 608], ymm1
vmovdqa ymm1, ymmword ptr [r9 + 2*r13 - 383968]
vpermt2w ymm24, ymm8, ymm1
vpunpcklqdq xmm3, xmm24, xmm3
vpmovsxbw ymm24, xmmword ptr [rip + .LCPI1_118]
vpblendw xmm4, xmm3, xmm4, 32
vmovapd xmm3, xmmword ptr [r9 + 2*r13 - 364752]
vpslldq xmm5, xmm3, 10
vpblendd xmm4, xmm4, xmm5, 8
vpbroadcastw xmm5, word ptr [r9 + 2*r13 - 358350]
vpblendw xmm4, xmm4, xmm5, 128
vpshuflw ymm5, ymm2, 85
vpblendw ymm5, ymm5, ymm10, 2
vpblendd ymm4, ymm4, ymm5, 240
vpmovsxbw ymm5, xmmword ptr [rip + .LCPI1_119]
vmovdqa64 ymm19, ymm4
vmovdqa ymm4, ymmword ptr [rsp + 320]
vpermt2w ymm4, ymm25, ymm1
vmovdqa64 ymm25, ymm21
vshufps xmm1, xmm4, xmm14, 212
vpslldq xmm4, xmm7, 6
vpblendw xmm1, xmm1, xmm4, 32
vpbroadcastd ymm4, dword ptr [r9 + 2*r13 - 364748]
vpblendd xmm1, xmm1, xmm4, 8
vpslldq xmm4, xmm0, 10
vpblendw xmm1, xmm1, xmm4, 128
vpbroadcastd ymm4, dword ptr [r9 + 2*r13 - 351948]
vpblendd ymm4, ymm1, ymm4, 240
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_120]
vpermt2w ymm4, ymm1, ymm10
vmovdqa ymm1, ymmword ptr [r9 + 2*r13 - 339168]
vpblendd ymm4, ymm4, ymm1, 32
vpermt2w ymm30, ymm28, ymm1
vpmovsxbw xmm28, qword ptr [rip + .LCPI1_100]
vpermt2w ymm4, ymm5, ymm9
vpbroadcastd ymm5, dword ptr [r9 + 2*r13 - 326348]
vpermt2w ymm30, ymm29, ymm9
vpblendd ymm4, ymm4, ymm5, 192
vpmovsxbw ymm5, xmmword ptr [rip + .LCPI1_121]
vpermt2w ymm4, ymm5, ymm15
vpbroadcastd ymm5, dword ptr [r9 + 2*r13 - 313548]
vpblendd ymm4, ymm4, ymm5, 128
vpsrlq xmm5, xmm31, 48
vmovdqa ymmword ptr [rsp + 320], ymm4
vpsrlq xmm4, xmm23, 48
vpunpcklwd xmm4, xmm4, xmm5
vpsrlq xmm5, xmm16, 48
vpunpckldq xmm4, xmm4, xmm5
vpsrlq xmm5, xmm14, 48
vpblendw xmm4, xmm4, xmm12, 8
vpunpcklqdq xmm4, xmm4, xmm5
vpbroadcastw xmm5, word ptr [r9 + 2*r13 - 371146]
vpblendw xmm4, xmm4, xmm5, 32
vpmovzxwd xmm5, xmm3
vpblendd xmm4, xmm4, xmm5, 8
vpbroadcastw xmm5, word ptr [r9 + 2*r13 - 358346]
vpblendw xmm4, xmm4, xmm5, 128
vpsrldq xmm5, xmm23, 10
vpunpcklwd xmm5, xmm5, xmm6
vpsrldq xmm6, xmm16, 10
vpunpckldq xmm5, xmm5, xmm6
vpbroadcastw xmm6, word ptr [r9 + 2*r13 - 383942]
vpblendw xmm5, xmm5, xmm6, 8
vpsrldq xmm6, xmm14, 10
vpunpcklqdq xmm5, xmm5, xmm6
vpsllq xmm6, xmm3, 16
vpsrld xmm3, xmm3, 16
vpblendw xmm5, xmm5, xmm7, 32
vpblendd xmm5, xmm5, xmm6, 8
vpbroadcastw xmm6, word ptr [r9 + 2*r13 - 358342]
vpblendw xmm11, xmm5, xmm6, 128
vpsrldq xmm6, xmm23, 14
vpsrldq xmm5, xmm31, 14
vpmovsxbw ymm23, xmmword ptr [rip + .LCPI1_122]
vpunpcklwd xmm5, xmm6, xmm5
vpsrldq xmm6, xmm16, 14
vpunpckldq xmm5, xmm5, xmm6
vpbroadcastw xmm6, word ptr [r9 + 2*r13 - 383938]
vpblendw xmm5, xmm5, xmm6, 8
vpsrldq xmm6, xmm14, 14
vpunpcklqdq xmm5, xmm5, xmm6
vpbroadcastw xmm6, word ptr [r9 + 2*r13 - 371138]
vpblendw xmm5, xmm5, xmm6, 32
vpbroadcastw xmm6, word ptr [r9 + 2*r13 - 396740]
vpblendd xmm3, xmm5, xmm3, 8
vpmovsxbw ymm5, xmmword ptr [rip + .LCPI1_123]
vpermt2w ymm20, ymm5, ymm2
vpmovsxbw ymm5, xmmword ptr [rip + .LCPI1_124]
vpermt2w ymm17, ymm5, ymm2
vpmovsxbw ymm5, xmmword ptr [rip + .LCPI1_125]
vpermt2w ymm18, ymm5, ymm2
vpmovsxbw ymm5, xmmword ptr [rip + .LCPI1_126]
vpermt2w ymm11, ymm5, ymm2
vpblendw xmm5, xmm3, xmm0, 128
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_127]
vpermt2w ymm5, ymm3, ymm2
vpbroadcastd ymm3, dword ptr [rip + .LCPI1_66]
vpermt2w ymm2, ymm3, ymm10
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_128]
vpermt2w ymm2, ymm3, ymm1
vpbroadcastw xmm3, word ptr [r9 + 2*r13 - 396744]
vpblendw ymm2, ymm2, ymm9, 8
vpblendd ymm13, ymm4, ymm2, 240
vpbroadcastw ymm2, word ptr [r9 + 2*r13 - 403144]
vpmovsxbw ymm4, xmmword ptr [rip + .LCPI1_129]
vpunpcklwd xmm2, xmm2, xmm3
vunpcklps xmm2, xmm2, dword ptr [r9 + 2*r13 - 390344]{1to4}
vpsrldq xmm3, xmm12, 2
vpblendw xmm2, xmm2, xmm3, 8
vpslld xmm3, xmm7, 16
vpblendd xmm2, xmm14, xmm2, 3
vpblendw xmm2, xmm2, xmm3, 32
vpbroadcastd xmm3, dword ptr [r9 + 2*r13 - 364744]
vpblendd xmm2, xmm2, xmm3, 8
vpsllq xmm3, xmm0, 48
vpslld xmm0, xmm0, 16
vpblendw xmm2, xmm2, xmm3, 128
vpbroadcastq ymm3, qword ptr [r9 + 2*r13 - 351944]
vpblendd ymm2, ymm2, ymm3, 240
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_130]
vpermt2w ymm2, ymm3, ymm10
vpbroadcastd ymm3, dword ptr [r9 + 2*r13 - 339144]
vpblendd ymm3, ymm2, ymm3, 32
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_131]
vpermt2w ymm3, ymm2, ymm9
vmovdqa ymm2, ymmword ptr [r9 + 2*r13 - 326368]
vpblendd ymm3, ymm3, ymm2, 192
vpermt2w ymm30, ymm24, ymm2
vpmovsxbw ymm24, xmmword ptr [rip + .LCPI1_105]
vpermt2w ymm3, ymm4, ymm15
vpbroadcastd ymm4, dword ptr [r9 + 2*r13 - 313544]
vpblendd ymm8, ymm3, ymm4, 128
vpmovsxbw ymm4, xmmword ptr [rip + .LCPI1_132]
vmovdqa ymm3, ymm11
vpermt2w ymm3, ymm4, ymm10
vpmovsxbw ymm4, xmmword ptr [rip + .LCPI1_133]
vpermt2w ymm3, ymm4, ymm1
vpmovsxbw ymm4, xmmword ptr [rip + .LCPI1_134]
vpermt2w ymm3, ymm4, ymm9
vpmovsxbw ymm4, xmmword ptr [rip + .LCPI1_135]
vpermt2w ymm3, ymm4, ymm2
vpbroadcastd ymm4, dword ptr [r9 + 2*r13 - 390340]
vpblendw ymm3, ymm3, ymm15, 32
vpblendd ymm11, ymm11, ymm3, 240
vpbroadcastw ymm3, word ptr [r9 + 2*r13 - 403140]
vpunpcklwd xmm3, xmm3, xmm6
vpblendd xmm3, xmm3, xmm4, 2
vpsrldq xmm4, xmm12, 6
vmovdqa64 ymm12, ymm18
vmovdqa64 xmm18, xmmword ptr [rsp + 160]
vpblendw xmm3, xmm3, xmm4, 8
vpsrlq xmm4, xmm7, 16
vmovdqa64 ymm7, ymm17
vshufps xmm3, xmm3, xmm14, 244
vpmovsxbw ymm14, xmmword ptr [rip + .LCPI1_107]
vpblendw xmm3, xmm3, xmm4, 32
vpbroadcastd ymm4, dword ptr [r9 + 2*r13 - 364740]
vpblendd xmm3, xmm3, xmm4, 8
vpblendw xmm0, xmm3, xmm0, 128
vpbroadcastd ymm3, dword ptr [r9 + 2*r13 - 351940]
vpblendd ymm0, ymm0, ymm3, 240
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_136]
vpermt2w ymm0, ymm3, ymm10
vpbroadcastd ymm3, dword ptr [r9 + 2*r13 - 339140]
vpblendd ymm0, ymm0, ymm3, 32
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_137]
vpermt2w ymm0, ymm3, ymm9
vpbroadcastd ymm3, dword ptr [r9 + 2*r13 - 326340]
vpblendd ymm0, ymm0, ymm3, 192
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_138]
vpermt2w ymm20, ymm3, ymm10
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_139]
vpermt2w ymm7, ymm3, ymm10
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_140]
vpermt2w ymm12, ymm3, ymm10
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_141]
vpermt2w ymm5, ymm3, ymm10
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_142]
vmovdqa ymm10, ymmword ptr [r9 + 2*r13 - 217568]
vpermt2w ymm20, ymm3, ymm1
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_143]
vpermt2w ymm7, ymm3, ymm1
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_144]
vpermt2w ymm12, ymm3, ymm1
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_145]
vpermt2w ymm19, ymm3, ymm1
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_146]
vpermt2w ymm5, ymm3, ymm1
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_147]
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_148]
vpermt2w ymm20, ymm1, ymm9
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_149]
vpermt2w ymm7, ymm1, ymm9
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_150]
vpermt2w ymm12, ymm1, ymm9
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_151]
vpermt2w ymm19, ymm1, ymm9
vmovdqa ymm1, ymm5
vpermt2w ymm1, ymm3, ymm9
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_152]
vpermt2w ymm20, ymm3, ymm2
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_153]
vpermt2w ymm7, ymm3, ymm2
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_154]
vpermt2w ymm12, ymm3, ymm2
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_155]
vpermt2w ymm19, ymm3, ymm2
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_156]
vpermt2w ymm13, ymm3, ymm2
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_157]
vpermt2w ymm1, ymm3, ymm2
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_158]
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_159]
vpermt2w ymm30, ymm2, ymm15
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_160]
vpermt2w ymm20, ymm2, ymm15
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_161]
vpermt2w ymm7, ymm2, ymm15
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_162]
vpermt2w ymm12, ymm2, ymm15
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_163]
vpermt2w ymm19, ymm2, ymm15
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_164]
vpermt2w ymm13, ymm2, ymm15
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_165]
vpermt2w ymm0, ymm2, ymm15
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_166]
vpermt2w ymm1, ymm2, ymm15
vmovdqa ymm2, ymmword ptr [r9 + 2*r13 - 313568]
vmovdqa xmm15, xmmword ptr [r9 + 2*r13 - 255968]
vpermt2w ymm30, ymm3, ymm2
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_167]
vpblendd ymm9, ymm0, ymm2, 128
vpmovsxbw ymm0, xmmword ptr [rip + .LCPI1_168]
vpbroadcastw xmm6, xmm15
vpermt2w ymm20, ymm3, ymm2
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_169]
vpermt2w ymm1, ymm0, ymm2
vmovdqa ymm0, ymmword ptr [r9 + 2*r13 - 307168]
vpermt2w ymm7, ymm3, ymm2
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_170]
vpermt2w ymm12, ymm3, ymm2
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_171]
vpermt2w ymm19, ymm3, ymm2
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_172]
vpermt2w ymm13, ymm3, ymm2
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_173]
vpermt2w ymm11, ymm3, ymm2
vmovdqa ymm2, ymmword ptr [rsp + 704]
vmovdqa ymm3, ymmword ptr [rsp + 672]
vpermt2w ymm2, ymm23, ymm0
vpbroadcastd ymm23, dword ptr [rip + .LCPI1_66]
vmovdqa ymmword ptr [rsp + 704], ymm2
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_174]
vpermt2w ymm30, ymm2, ymm0
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_175]
vmovdqa64 ymmword ptr [rsp + 1344], ymm30
vpermt2w ymm3, ymm2, ymm0
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_176]
vmovdqa ymmword ptr [rsp + 672], ymm3
vmovdqa ymm3, ymmword ptr [rsp + 640]
vpermt2w ymm20, ymm2, ymm0
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_177]
vmovdqa64 ymmword ptr [rsp + 1664], ymm20
vpmovsxbw ymm20, xmmword ptr [rip + .LCPI1_126]
vpermt2w ymm22, ymm2, ymm0
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_178]
vmovdqa64 ymmword ptr [rsp + 1632], ymm22
vpermt2w ymm7, ymm2, ymm0
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_179]
vmovdqa ymmword ptr [rsp + 1600], ymm7
vmovdqa ymm7, ymmword ptr [r9 + 2*r13 - 243168]
vpermt2w ymm3, ymm2, ymm0
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_180]
vmovdqa ymmword ptr [rsp + 640], ymm3
vmovdqa ymm3, ymmword ptr [rsp + 608]
vpermt2w ymm12, ymm2, ymm0
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_181]
vmovdqa ymmword ptr [rsp + 1568], ymm12
vpermt2w ymm3, ymm2, ymm0
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_182]
vmovdqa ymmword ptr [rsp + 608], ymm3
vmovdqa ymm3, ymmword ptr [rsp + 320]
vpermt2w ymm19, ymm2, ymm0
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_183]
vmovdqa64 ymmword ptr [rsp + 1536], ymm19
vpmovsxbw ymm19, xmmword ptr [rip + .LCPI1_97]
vpermt2w ymm3, ymm2, ymm0
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_184]
vmovdqa ymmword ptr [rsp + 320], ymm3
vpermt2w ymm13, ymm2, ymm0
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_185]
vmovdqa ymmword ptr [rsp + 1504], ymm13
vpmovsxbw ymm13, xmmword ptr [rip + .LCPI1_95]
vpermt2w ymm8, ymm2, ymm0
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_186]
vmovdqa ymmword ptr [rsp + 1472], ymm8
vmovdqa xmm8, xmmword ptr [r9 + 2*r13 - 287968]
vpermt2w ymm11, ymm2, ymm0
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_187]
vmovdqa ymmword ptr [rsp + 1440], ymm11
vpbroadcastd ymm11, dword ptr [r9 + 2*r13 - 236768]
vpermt2w ymm9, ymm2, ymm0
vpblendw ymm2, ymm1, ymm0, 128
vmovdqa xmm0, xmmword ptr [r9 + 2*r13 - 300768]
vmovdqa xmm1, xmmword ptr [r9 + 2*r13 - 294368]
vpblendd ymm2, ymm5, ymm2, 240
vmovdqa ymmword ptr [rsp + 1376], ymm2
vmovdqa ymmword ptr [rsp + 1408], ymm9
vmovdqa xmm9, xmmword ptr [r9 + 2*r13 - 275168]
vpunpcklwd xmm3, xmm0, xmm1
vpunpckldq xmm4, xmm3, xmm8
vmovdqa xmm3, xmmword ptr [r9 + 2*r13 - 281568]
insertq xmm4, xmm3, 16, 48
vpunpcklqdq xmm2, xmm4, xmm9
vmovdqa xmm4, xmmword ptr [r9 + 2*r13 - 268768]
vpbroadcastw xmm5, xmm4
vpblendw xmm5, xmm2, xmm5, 32
vmovaps xmm2, xmmword ptr [r9 + 2*r13 - 262368]
vinsertps xmm5, xmm5, xmm2, 48
vpblendw xmm5, xmm5, xmm6, 128
vinserti128 ymm6, ymm5, xmmword ptr [r9 + 2*r13 - 249568], 1
vpermt2w ymm6, ymm21, ymm7
vpblendd ymm12, ymm6, ymm11, 32
vmovdqa ymm6, ymmword ptr [r9 + 2*r13 - 230368]
vmovdqa xmm11, xmmword ptr [r9 + 2*r13 - 223968]
vpermt2w ymm12, ymm13, ymm6
vpmovsxbw ymm13, xmmword ptr [rip + .LCPI1_96]
vinserti128 ymm5, ymm5, xmm11, 1
vshufpd ymm12, ymm12, ymm5, 2
vpermt2w ymm12, ymm13, ymm10
vpbroadcastd ymm13, dword ptr [r9 + 2*r13 - 211168]
vpblendd ymm12, ymm12, ymm13, 128
vpsrld xmm13, xmm8, 16
vmovdqa64 ymm17, ymm12
vpsrld xmm12, xmm0, 16
vpblendw xmm12, xmm12, xmm1, 2
vpunpckldq xmm12, xmm12, xmm13
vpmovsxbw xmm13, qword ptr [rip + .LCPI1_102]
vpermt2w xmm12, xmm13, xmm3
vpsrld xmm13, xmm9, 16
vpunpcklqdq xmm12, xmm12, xmm13
vpbroadcastw xmm13, word ptr [r9 + 2*r13 - 268766]
vpblendw xmm12, xmm12, xmm13, 32
vpslldq xmm13, xmm2, 10
vpblendd xmm12, xmm12, xmm13, 8
vpbroadcastw xmm13, word ptr [r9 + 2*r13 - 255966]
vpblendw xmm5, xmm12, xmm13, 128
vpbroadcastw xmm12, word ptr [r9 + 2*r13 - 294364]
vpbroadcastw xmm13, word ptr [r9 + 2*r13 - 300764]
vmovdqa64 ymm22, ymm5
vpunpcklwd xmm12, xmm13, xmm12
vpmovsxbw xmm13, qword ptr [rip + .LCPI1_106]
vpblendd xmm12, xmm12, xmm8, 2
vpermt2w xmm12, xmm13, xmm3
vpslldq xmm13, xmm4, 6
vshufps xmm12, xmm12, xmm9, 212
vpblendw xmm12, xmm12, xmm13, 32
vpslldq xmm13, xmm15, 10
vinsertps xmm12, xmm12, xmm2, 112
vpblendw xmm12, xmm12, xmm13, 128
vpbroadcastd ymm13, dword ptr [r9 + 2*r13 - 249564]
vpblendd ymm13, ymm12, ymm13, 240
vinserti128 ymm12, ymm12, xmmword ptr [r9 + 2*r13 - 236768], 1
vpermt2w ymm13, ymm14, ymm7
vpbroadcastd ymm14, dword ptr [r9 + 2*r13 - 223964]
vpblendd ymm12, ymm13, ymm12, 34
vpmovsxbw ymm13, xmmword ptr [rip + .LCPI1_108]
vpermt2w ymm12, ymm13, ymm6
vpbroadcastd ymm13, dword ptr [r9 + 2*r13 - 211164]
vpblendd ymm12, ymm12, ymm14, 192
vpmovsxbw ymm14, xmmword ptr [rip + .LCPI1_109]
vpermt2w ymm12, ymm14, ymm10
vpbroadcastq ymm14, qword ptr [r9 + 2*r13 - 249560]
vpblendd ymm12, ymm12, ymm13, 128
vpsrlq xmm13, xmm0, 48
vmovdqa ymmword ptr [rsp + 576], ymm12
vpsrlq xmm12, xmm1, 48
vpunpcklwd xmm12, xmm13, xmm12
vpsrlq xmm13, xmm8, 48
vpunpckldq xmm12, xmm12, xmm13
vpsrlq xmm13, xmm9, 48
vpblendw xmm12, xmm12, xmm3, 8
vpunpcklqdq xmm12, xmm12, xmm13
vpbroadcastw xmm13, word ptr [r9 + 2*r13 - 268762]
vpblendw xmm12, xmm12, xmm13, 32
vpmovzxwd xmm13, xmm2
vpblendd xmm12, xmm12, xmm13, 8
vpbroadcastw xmm13, word ptr [r9 + 2*r13 - 255962]
vpblendw xmm5, xmm12, xmm13, 128
vpbroadcastw xmm12, word ptr [r9 + 2*r13 - 294360]
vpbroadcastw xmm13, word ptr [r9 + 2*r13 - 300760]
vmovdqa64 ymm29, ymm5
vpmovsxbw ymm5, xmmword ptr [rip + .LCPI1_105]
vpunpcklwd xmm12, xmm13, xmm12
vpsrldq xmm13, xmm3, 2
vpsrldq xmm3, xmm3, 6
vinsertps xmm12, xmm12, xmm8, 156
vpblendw xmm12, xmm12, xmm13, 8
vpslld xmm13, xmm4, 16
vpblendd xmm12, xmm9, xmm12, 3
vpblendw xmm12, xmm12, xmm13, 32
vpsllq xmm13, xmm15, 48
vinsertps xmm12, xmm12, xmm2, 176
vpblendw xmm12, xmm12, xmm13, 128
vpblendd ymm13, ymm12, ymm14, 240
vpbroadcastd ymm14, dword ptr [r9 + 2*r13 - 236760]
vinserti128 ymm11, ymm12, xmm11, 1
vpmovsxbw ymm12, xmmword ptr [rip + .LCPI1_103]
vpermt2w ymm13, ymm5, ymm7
vpblendd ymm13, ymm13, ymm14, 32
vmovdqa xmm14, xmmword ptr [r9 + 2*r13 - 281552]
vpermt2w ymm13, ymm12, ymm6
vpbroadcastd ymm12, dword ptr [r9 + 2*r13 - 211160]
vpblendd ymm11, ymm13, ymm11, 204
vpmovsxbw ymm13, xmmword ptr [rip + .LCPI1_111]
vpermt2w ymm11, ymm13, ymm10
vmovapd xmm13, xmmword ptr [rsp + 192]
vpblendd ymm11, ymm11, ymm12, 128
vpsrldq xmm12, xmm0, 10
vpsrldq xmm0, xmm0, 14
vmovdqa64 ymm31, ymm11
vpsrldq xmm11, xmm1, 10
vpsrldq xmm1, xmm1, 14
vpunpcklwd xmm11, xmm12, xmm11
vpsrldq xmm12, xmm8, 10
vpunpcklwd xmm0, xmm0, xmm1
vpbroadcastw xmm1, word ptr [r9 + 2*r13 - 281554]
vpunpckldq xmm11, xmm11, xmm12
vpbroadcastw xmm12, word ptr [r9 + 2*r13 - 281558]
vpblendw xmm11, xmm11, xmm12, 8
vpsrldq xmm12, xmm9, 10
vpunpcklqdq xmm11, xmm11, xmm12
vpsllq xmm12, xmm2, 16
vpblendw xmm11, xmm11, xmm4, 32
vpsrlq xmm4, xmm4, 16
vpblendd xmm11, xmm11, xmm12, 8
vpbroadcastw xmm12, word ptr [r9 + 2*r13 - 255958]
vpblendw xmm5, xmm11, xmm12, 128
vpbroadcastw xmm11, word ptr [r9 + 2*r13 - 294356]
vpbroadcastw xmm12, word ptr [r9 + 2*r13 - 300756]
vmovdqa64 ymm30, ymm5
vmovdqa64 ymm21, ymm30
vpunpcklwd xmm11, xmm12, xmm11
vpmovsxbw ymm12, xmmword ptr [rip + .LCPI1_112]
vinsertps xmm11, xmm11, xmm8, 220
vpblendw xmm3, xmm11, xmm3, 8
vpslld xmm11, xmm15, 16
vshufps xmm3, xmm3, xmm9, 244
vpblendw xmm3, xmm3, xmm4, 32
vpbroadcastd ymm4, dword ptr [r9 + 2*r13 - 249556]
vpblendd xmm3, xmm3, xmm2, 8
vpblendw xmm3, xmm3, xmm11, 128
vpbroadcastd ymm11, dword ptr [r9 + 2*r13 - 236756]
vpblendd ymm4, ymm3, ymm4, 240
vinserti128 ymm3, ymm3, xmmword ptr [r9 + 2*r13 - 211168], 1
vpermt2w ymm4, ymm12, ymm7
vpsrldq xmm12, xmm13, 10
vpblendd ymm4, ymm4, ymm11, 32
vpmovsxbw ymm11, xmmword ptr [rip + .LCPI1_113]
vpermt2w ymm4, ymm11, ymm6
vpbroadcastd ymm11, dword ptr [r9 + 2*r13 - 223956]
vpblendd ymm4, ymm4, ymm11, 192
vpmovsxbw ymm11, xmmword ptr [rip + .LCPI1_114]
vpermt2w ymm4, ymm11, ymm10
vpblendd ymm3, ymm4, ymm3, 136
vpmovsxbw ymm4, xmmword ptr [rip + .LCPI1_115]
vmovdqa ymmword ptr [rsp + 544], ymm3
vpsrldq xmm3, xmm8, 14
vmovdqa xmm8, xmmword ptr [rsp + 224]
vpunpckldq xmm0, xmm0, xmm3
vpbroadcastd ymm3, dword ptr [r9 + 2*r13 - 236752]
vpblendw xmm0, xmm0, xmm1, 8
vpsrldq xmm1, xmm9, 14
vmovdqa xmm9, xmmword ptr [r9 + 2*r13 - 275152]
vpunpcklqdq xmm0, xmm0, xmm1
vpbroadcastw xmm1, word ptr [r9 + 2*r13 - 268754]
vpblendw xmm0, xmm0, xmm1, 32
vpsrld xmm1, xmm2, 16
vpblendd xmm0, xmm0, xmm1, 8
vpblendw xmm5, xmm0, xmm15, 128
vpbroadcastw ymm0, word ptr [r9 + 2*r13 - 300752]
vmovdqa xmm15, xmmword ptr [r9 + 2*r13 - 268752]
vpunpcklwd xmm0, xmm0, xmm13
vpbroadcastw xmm1, xmm15
vpunpckldq xmm0, xmm0, xmm18
insertq xmm0, xmm14, 16, 48
vpunpcklqdq xmm0, xmm0, xmm9
vpblendw xmm0, xmm0, xmm1, 32
vpbroadcastd xmm1, dword ptr [r9 + 2*r13 - 262352]
vpblendd xmm1, xmm0, xmm1, 8
vmovdqa xmm0, xmmword ptr [r9 + 2*r13 - 255952]
vpbroadcastw xmm2, xmm0
vpblendw xmm1, xmm1, xmm2, 128
vmovdqa ymm2, ymmword ptr [r9 + 2*r13 - 249568]
vpblendd ymm1, ymm1, ymm2, 240
vpermt2w ymm1, ymm4, ymm7
vpmovsxbw ymm4, xmmword ptr [rip + .LCPI1_116]
vpblendd ymm1, ymm1, ymm3, 32
vpbroadcastq ymm3, qword ptr [r9 + 2*r13 - 223952]
vpermt2w ymm1, ymm4, ymm6
vmovdqa ymm4, ymmword ptr [rsp + 256]
vpblendd ymm1, ymm1, ymm3, 192
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_117]
vpermt2w ymm1, ymm3, ymm10
vpbroadcastd ymm3, dword ptr [r9 + 2*r13 - 211152]
vpblendd ymm1, ymm1, ymm3, 128
vpsrld xmm3, xmm9, 16
vmovdqa ymmword ptr [rsp + 512], ymm1
vmovdqa ymm1, ymmword ptr [r9 + 2*r13 - 281568]
vpermt2w ymm4, ymm26, ymm1
vpunpcklqdq xmm3, xmm4, xmm3
vpbroadcastw xmm4, word ptr [r9 + 2*r13 - 268750]
vpblendw xmm4, xmm3, xmm4, 32
vmovapd xmm3, xmmword ptr [r9 + 2*r13 - 262352]
vpslldq xmm11, xmm3, 10
vpblendd xmm4, xmm4, xmm11, 8
vpbroadcastw xmm11, word ptr [r9 + 2*r13 - 255950]
vpblendw xmm4, xmm4, xmm11, 128
vpshuflw ymm11, ymm2, 85
vpblendw ymm11, ymm11, ymm7, 2
vpblendd ymm4, ymm4, ymm11, 240
vpmovsxbw ymm11, xmmword ptr [rip + .LCPI1_119]
vmovdqa64 ymm16, ymm4
vmovdqa ymm4, ymmword ptr [rsp + 288]
vpermt2w ymm4, ymm28, ymm1
vshufps xmm1, xmm4, xmm9, 212
vpslldq xmm4, xmm15, 6
vpblendw xmm1, xmm1, xmm4, 32
vpbroadcastd ymm4, dword ptr [r9 + 2*r13 - 262348]
vpblendd xmm1, xmm1, xmm4, 8
vpslldq xmm4, xmm0, 10
vpblendw xmm1, xmm1, xmm4, 128
vpbroadcastd ymm4, dword ptr [r9 + 2*r13 - 249548]
vpblendd ymm4, ymm1, ymm4, 240
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_120]
vpermt2w ymm4, ymm1, ymm7
vmovdqa ymm1, ymmword ptr [r9 + 2*r13 - 236768]
vpblendd ymm4, ymm4, ymm1, 32
vpermt2w ymm4, ymm11, ymm6
vpbroadcastd ymm11, dword ptr [r9 + 2*r13 - 223948]
vpblendd ymm4, ymm4, ymm11, 192
vpmovsxbw ymm11, xmmword ptr [rip + .LCPI1_121]
vpermt2w ymm4, ymm11, ymm10
vpbroadcastd ymm11, dword ptr [r9 + 2*r13 - 211148]
vpblendd ymm4, ymm4, ymm11, 128
vpsrlq xmm11, xmm13, 48
vmovdqa ymmword ptr [rsp + 288], ymm4
vpsrlq xmm4, xmm8, 48
vpunpcklwd xmm4, xmm4, xmm11
vpsrlq xmm11, xmm18, 48
vpunpckldq xmm4, xmm4, xmm11
vpsrlq xmm11, xmm9, 48
vpblendw xmm4, xmm4, xmm14, 8
vpunpcklqdq xmm4, xmm4, xmm11
vpbroadcastw xmm11, word ptr [r9 + 2*r13 - 268746]
vpblendw xmm4, xmm4, xmm11, 32
vpmovzxwd xmm11, xmm3
vpblendd xmm4, xmm4, xmm11, 8
vpbroadcastw xmm11, word ptr [r9 + 2*r13 - 255946]
vpblendw xmm4, xmm4, xmm11, 128
vpsrldq xmm11, xmm8, 10
vpunpcklwd xmm11, xmm11, xmm12
vpsrldq xmm12, xmm18, 10
vpunpckldq xmm11, xmm11, xmm12
vpbroadcastw xmm12, word ptr [r9 + 2*r13 - 281542]
vpblendw xmm11, xmm11, xmm12, 8
vpsrldq xmm12, xmm9, 10
vpunpcklqdq xmm11, xmm11, xmm12
vpsllq xmm12, xmm3, 16
vpsrld xmm3, xmm3, 16
vpblendw xmm11, xmm11, xmm15, 32
vpblendd xmm11, xmm11, xmm12, 8
vpbroadcastw xmm12, word ptr [r9 + 2*r13 - 255942]
vpblendw xmm12, xmm11, xmm12, 128
vpsrldq xmm11, xmm13, 14
vpsrldq xmm13, xmm8, 14
vpunpcklwd xmm11, xmm13, xmm11
vpsrldq xmm13, xmm18, 14
vmovdqa64 ymm18, ymm22
vmovdqa64 ymm22, ymm5
vpermt2w ymm12, ymm20, ymm2
vpermt2w ymm18, ymm19, ymm2
vpmovsxbw ymm20, xmmword ptr [rip + .LCPI1_104]
vpunpckldq xmm11, xmm11, xmm13
vpbroadcastw xmm13, word ptr [r9 + 2*r13 - 281538]
vpblendw xmm11, xmm11, xmm13, 8
vpsrldq xmm13, xmm9, 14
vpunpcklqdq xmm11, xmm11, xmm13
vpbroadcastw xmm13, word ptr [r9 + 2*r13 - 268738]
vpblendw xmm11, xmm11, xmm13, 32
vpblendd xmm3, xmm11, xmm3, 8
vpmovsxbw ymm11, xmmword ptr [rip + .LCPI1_123]
vpermt2w ymm29, ymm11, ymm2
vpmovsxbw ymm11, xmmword ptr [rip + .LCPI1_124]
vpermt2w ymm21, ymm11, ymm2
vpmovsxbw ymm11, xmmword ptr [rip + .LCPI1_125]
vpermt2w ymm22, ymm11, ymm2
vpblendw xmm11, xmm3, xmm0, 128
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_127]
vpermt2w ymm11, ymm3, ymm2
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_128]
vpermt2w ymm2, ymm23, ymm7
vpmovsxbw ymm23, xmmword ptr [rip + .LCPI1_118]
vpermt2w ymm2, ymm3, ymm1
vpbroadcastw xmm3, word ptr [r9 + 2*r13 - 294344]
vpblendw ymm2, ymm2, ymm6, 8
vpblendd ymm13, ymm4, ymm2, 240
vpbroadcastw ymm2, word ptr [r9 + 2*r13 - 300744]
vpmovsxbw ymm4, xmmword ptr [rip + .LCPI1_129]
vpunpcklwd xmm2, xmm2, xmm3
vunpcklps xmm2, xmm2, dword ptr [r9 + 2*r13 - 287944]{1to4}
vpsrldq xmm3, xmm14, 2
vpblendw xmm2, xmm2, xmm3, 8
vpslld xmm3, xmm15, 16
vpblendd xmm2, xmm9, xmm2, 3
vpblendw xmm2, xmm2, xmm3, 32
vpbroadcastd xmm3, dword ptr [r9 + 2*r13 - 262344]
vpblendd xmm2, xmm2, xmm3, 8
vpsllq xmm3, xmm0, 48
vpslld xmm0, xmm0, 16
vpblendw xmm2, xmm2, xmm3, 128
vpbroadcastq ymm3, qword ptr [r9 + 2*r13 - 249544]
vpblendd ymm2, ymm2, ymm3, 240
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_130]
vpermt2w ymm2, ymm3, ymm7
vpbroadcastd ymm3, dword ptr [r9 + 2*r13 - 236744]
vpblendd ymm3, ymm2, ymm3, 32
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_131]
vpermt2w ymm3, ymm2, ymm6
vmovdqa ymm2, ymmword ptr [r9 + 2*r13 - 223968]
vpblendd ymm3, ymm3, ymm2, 192
vpermt2w ymm3, ymm4, ymm10
vpbroadcastd ymm4, dword ptr [r9 + 2*r13 - 211144]
vpblendd ymm5, ymm3, ymm4, 128
vpmovsxbw ymm4, xmmword ptr [rip + .LCPI1_132]
vmovdqa ymm3, ymm12
vpermt2w ymm3, ymm4, ymm7
vpmovsxbw ymm4, xmmword ptr [rip + .LCPI1_133]
vpermt2w ymm3, ymm4, ymm1
vpmovsxbw ymm4, xmmword ptr [rip + .LCPI1_134]
vpermt2w ymm3, ymm4, ymm6
vpmovsxbw ymm4, xmmword ptr [rip + .LCPI1_135]
vpermt2w ymm3, ymm4, ymm2
vpbroadcastd ymm4, dword ptr [r9 + 2*r13 - 287940]
vpblendw ymm3, ymm3, ymm10, 32
vpblendd ymm8, ymm12, ymm3, 240
vpbroadcastw ymm3, word ptr [r9 + 2*r13 - 300740]
vpbroadcastw xmm12, word ptr [r9 + 2*r13 - 294340]
vpunpcklwd xmm3, xmm3, xmm12
vpblendd xmm3, xmm3, xmm4, 2
vpsrldq xmm4, xmm14, 6
vmovdqa64 ymm14, ymm21
vpblendw xmm3, xmm3, xmm4, 8
vpsrlq xmm4, xmm15, 16
vmovdqa64 ymm15, ymm18
vpermt2w ymm15, ymm27, ymm7
vpmovsxbw ymm18, xmmword ptr [rip + .LCPI1_107]
vshufps xmm3, xmm3, xmm9, 244
vmovdqa xmm9, xmmword ptr [r9 + 2*r13 - 179168]
vpblendw xmm3, xmm3, xmm4, 32
vpbroadcastd ymm4, dword ptr [r9 + 2*r13 - 262340]
vpermt2w ymm15, ymm20, ymm1
vpblendd xmm3, xmm3, xmm4, 8
vmovdqa64 ymm4, ymm22
vpmovsxbw ymm22, xmmword ptr [rip + .LCPI1_122]
vpblendw xmm0, xmm3, xmm0, 128
vpbroadcastd ymm3, dword ptr [r9 + 2*r13 - 249540]
vpblendd ymm0, ymm0, ymm3, 240
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_136]
vpermt2w ymm0, ymm3, ymm7
vpbroadcastd ymm3, dword ptr [r9 + 2*r13 - 236740]
vpblendd ymm0, ymm0, ymm3, 32
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_137]
vpermt2w ymm0, ymm3, ymm6
vpbroadcastd ymm3, dword ptr [r9 + 2*r13 - 223940]
vpblendd ymm0, ymm0, ymm3, 192
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_138]
vpermt2w ymm29, ymm3, ymm7
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_139]
vpermt2w ymm14, ymm3, ymm7
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_140]
vpermt2w ymm4, ymm3, ymm7
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_141]
vpermt2w ymm11, ymm3, ymm7
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_142]
vmovaps xmm7, xmmword ptr [r9 + 2*r13 - 159968]
vpermt2w ymm29, ymm3, ymm1
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_143]
vpermt2w ymm14, ymm3, ymm1
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_144]
vpermt2w ymm4, ymm3, ymm1
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_145]
vpermt2w ymm16, ymm3, ymm1
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_146]
vpermt2w ymm11, ymm3, ymm1
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_101]
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_148]
vpermt2w ymm15, ymm1, ymm6
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_147]
vpermt2w ymm15, ymm23, ymm2
vpermt2w ymm29, ymm1, ymm6
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_149]
vpermt2w ymm14, ymm1, ymm6
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_150]
vpermt2w ymm4, ymm1, ymm6
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_151]
vpermt2w ymm16, ymm1, ymm6
vmovdqa ymm1, ymm11
vpermt2w ymm1, ymm3, ymm6
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_152]
vpermt2w ymm29, ymm3, ymm2
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_153]
vpermt2w ymm14, ymm3, ymm2
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_154]
vpermt2w ymm4, ymm3, ymm2
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_155]
vpermt2w ymm16, ymm3, ymm2
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_156]
vpermt2w ymm13, ymm3, ymm2
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_157]
vpermt2w ymm1, ymm3, ymm2
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_158]
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_159]
vpermt2w ymm15, ymm2, ymm10
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_160]
vpermt2w ymm29, ymm2, ymm10
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_161]
vpermt2w ymm14, ymm2, ymm10
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_162]
vpermt2w ymm4, ymm2, ymm10
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_163]
vpermt2w ymm16, ymm2, ymm10
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_164]
vpermt2w ymm13, ymm2, ymm10
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_165]
vpermt2w ymm0, ymm2, ymm10
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_166]
vpermt2w ymm1, ymm2, ymm10
vmovdqa ymm2, ymmword ptr [r9 + 2*r13 - 211168]
vmovdqa xmm10, xmmword ptr [r9 + 2*r13 - 166368]
vpermt2w ymm15, ymm3, ymm2
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_167]
vpblendd ymm6, ymm0, ymm2, 128
vpmovsxbw ymm0, xmmword ptr [rip + .LCPI1_168]
vpermt2w ymm29, ymm3, ymm2
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_169]
vpermt2w ymm1, ymm0, ymm2
vmovdqa ymm0, ymmword ptr [r9 + 2*r13 - 204768]
vpermt2w ymm14, ymm3, ymm2
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_170]
vpermt2w ymm17, ymm22, ymm0
vmovdqa64 ymmword ptr [rsp + 1088], ymm17
vpmovsxbw xmm17, qword ptr [rip + .LCPI1_106]
vpermt2w ymm4, ymm3, ymm2
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_171]
vpermt2w ymm16, ymm3, ymm2
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_172]
vpermt2w ymm13, ymm3, ymm2
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_173]
vpermt2w ymm8, ymm3, ymm2
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_174]
vmovdqa ymm3, ymmword ptr [rsp + 576]
vpermt2w ymm15, ymm2, ymm0
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_175]
vmovdqa ymmword ptr [rsp + 992], ymm15
vmovdqa xmm15, xmmword ptr [rsp + 352]
vpermt2w ymm3, ymm2, ymm0
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_176]
vmovdqa ymmword ptr [rsp + 576], ymm3
vmovdqa ymm3, ymmword ptr [rsp + 544]
vpermt2w ymm29, ymm2, ymm0
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_177]
vmovdqa64 ymmword ptr [rsp + 864], ymm29
vmovdqa64 ymm29, ymm25
vpermt2w ymm31, ymm2, ymm0
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_178]
vmovdqa64 ymmword ptr [rsp + 1312], ymm31
vpermt2w ymm14, ymm2, ymm0
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_179]
vmovdqa ymmword ptr [rsp + 1024], ymm14
vpmovsxbw ymm14, xmmword ptr [rip + .LCPI1_107]
vpermt2w ymm3, ymm2, ymm0
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_180]
vmovdqa ymmword ptr [rsp + 544], ymm3
vmovdqa ymm3, ymmword ptr [rsp + 512]
vpermt2w ymm4, ymm2, ymm0
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_181]
vmovdqa ymmword ptr [rsp + 1280], ymm4
vmovdqa xmm4, xmmword ptr [r9 + 2*r13 - 185568]
vpermt2w ymm3, ymm2, ymm0
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_182]
vmovdqa ymmword ptr [rsp + 512], ymm3
vmovdqa ymm3, ymmword ptr [rsp + 288]
vpermt2w ymm16, ymm2, ymm0
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_183]
vmovdqa64 ymmword ptr [rsp + 1248], ymm16
vpermt2w ymm3, ymm2, ymm0
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_184]
vmovdqa ymmword ptr [rsp + 288], ymm3
vmovdqa xmm3, xmmword ptr [r9 + 2*r13 - 198368]
vpermt2w ymm13, ymm2, ymm0
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_185]
vmovdqa ymmword ptr [rsp + 1216], ymm13
vpmovsxbw ymm13, xmmword ptr [rip + .LCPI1_95]
vpermt2w ymm5, ymm2, ymm0
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_186]
vmovdqa ymmword ptr [rsp + 1184], ymm5
vmovdqa xmm5, xmmword ptr [r9 + 2*r13 - 172768]
vpermt2w ymm8, ymm2, ymm0
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_187]
vmovdqa ymmword ptr [rsp + 1152], ymm8
vmovdqa xmm8, xmmword ptr [r9 + 2*r13 - 153568]
vpermt2w ymm6, ymm2, ymm0
vpblendw ymm0, ymm1, ymm0, 128
vmovdqa ymm2, ymmword ptr [r9 + 2*r13 - 140768]
vpblendd ymm0, ymm11, ymm0, 240
vpbroadcastd ymm11, dword ptr [r9 + 2*r13 - 134368]
vmovdqa ymmword ptr [rsp + 1056], ymm0
vmovdqa ymmword ptr [rsp + 1120], ymm6
vmovdqa xmm6, xmmword ptr [r9 + 2*r13 - 191968]
vpunpcklwd xmm1, xmm3, xmm6
vpunpckldq xmm1, xmm1, xmm4
insertq xmm1, xmm9, 16, 48
vpunpcklqdq xmm0, xmm1, xmm5
vpbroadcastw xmm1, xmm10
vpblendw xmm0, xmm0, xmm1, 32
vpbroadcastw xmm1, xmm8
vinsertps xmm0, xmm0, xmm7, 48
vpblendw xmm0, xmm0, xmm1, 128
vinserti128 ymm1, ymm0, xmmword ptr [r9 + 2*r13 - 147168], 1
vpermt2w ymm1, ymm25, ymm2
vpblendd ymm12, ymm1, ymm11, 32
vmovdqa ymm1, ymmword ptr [r9 + 2*r13 - 127968]
vmovdqa xmm11, xmmword ptr [r9 + 2*r13 - 121568]
vpermt2w ymm12, ymm13, ymm1
vinserti128 ymm0, ymm0, xmm11, 1
vpmovsxbw ymm13, xmmword ptr [rip + .LCPI1_96]
vshufpd ymm12, ymm12, ymm0, 2
vmovdqa ymm0, ymmword ptr [r9 + 2*r13 - 115168]
vpermt2w ymm12, ymm13, ymm0
vpbroadcastd ymm13, dword ptr [r9 + 2*r13 - 108768]
vpblendd ymm12, ymm12, ymm13, 128
vpsrld xmm13, xmm4, 16
vmovdqa ymmword ptr [rsp + 256], ymm12
vpsrld xmm12, xmm3, 16
vpblendw xmm12, xmm12, xmm6, 2
vpunpckldq xmm12, xmm12, xmm13
vpmovsxbw xmm13, qword ptr [rip + .LCPI1_102]
vpermt2w xmm12, xmm13, xmm9
vpsrld xmm13, xmm5, 16
vpunpcklqdq xmm12, xmm12, xmm13
vpbroadcastw xmm13, word ptr [r9 + 2*r13 - 166366]
vpblendw xmm12, xmm12, xmm13, 32
vpslldq xmm13, xmm7, 10
vpblendd xmm12, xmm12, xmm13, 8
vpbroadcastw xmm13, word ptr [r9 + 2*r13 - 153566]
vpblendw xmm12, xmm12, xmm13, 128
vpbroadcastw xmm13, word ptr [r9 + 2*r13 - 198364]
vmovdqa64 ymm21, ymm12
vpbroadcastw xmm12, word ptr [r9 + 2*r13 - 191964]
vpunpcklwd xmm12, xmm13, xmm12
vpmovsxbw xmm13, qword ptr [rip + .LCPI1_106]
vpblendd xmm12, xmm12, xmm4, 2
vpermt2w xmm12, xmm13, xmm9
vpslldq xmm13, xmm10, 6
vshufps xmm12, xmm12, xmm5, 212
vpblendw xmm12, xmm12, xmm13, 32
vpslldq xmm13, xmm8, 10
vinsertps xmm12, xmm12, xmm7, 112
vpblendw xmm12, xmm12, xmm13, 128
vpbroadcastd ymm13, dword ptr [r9 + 2*r13 - 147164]
vpblendd ymm13, ymm12, ymm13, 240
vinserti128 ymm12, ymm12, xmmword ptr [r9 + 2*r13 - 134368], 1
vpermt2w ymm13, ymm14, ymm2
vpbroadcastd ymm14, dword ptr [r9 + 2*r13 - 121564]
vpblendd ymm12, ymm13, ymm12, 34
vpmovsxbw ymm13, xmmword ptr [rip + .LCPI1_108]
vpermt2w ymm12, ymm13, ymm1
vpbroadcastd ymm13, dword ptr [r9 + 2*r13 - 108764]
vpblendd ymm12, ymm12, ymm14, 192
vpmovsxbw ymm14, xmmword ptr [rip + .LCPI1_109]
vpermt2w ymm12, ymm14, ymm0
vpbroadcastq ymm14, qword ptr [r9 + 2*r13 - 147160]
vpblendd ymm12, ymm12, ymm13, 128
vpsrlq xmm13, xmm3, 48
vmovdqa ymmword ptr [rsp + 224], ymm12
vpsrlq xmm12, xmm6, 48
vpunpcklwd xmm12, xmm13, xmm12
vpsrlq xmm13, xmm4, 48
vpunpckldq xmm12, xmm12, xmm13
vpsrlq xmm13, xmm5, 48
vpblendw xmm12, xmm12, xmm9, 8
vpunpcklqdq xmm12, xmm12, xmm13
vpbroadcastw xmm13, word ptr [r9 + 2*r13 - 166362]
vpblendw xmm12, xmm12, xmm13, 32
vpmovzxwd xmm13, xmm7
vpblendd xmm12, xmm12, xmm13, 8
vpbroadcastw xmm13, word ptr [r9 + 2*r13 - 153562]
vpblendw xmm12, xmm12, xmm13, 128
vpbroadcastw xmm13, word ptr [r9 + 2*r13 - 198360]
vmovdqa64 ymm31, ymm12
vpbroadcastw xmm12, word ptr [r9 + 2*r13 - 191960]
vpunpcklwd xmm12, xmm13, xmm12
vpsrldq xmm13, xmm9, 2
vpsrldq xmm9, xmm9, 6
vinsertps xmm12, xmm12, xmm4, 156
vpblendw xmm12, xmm12, xmm13, 8
vpslld xmm13, xmm10, 16
vpblendd xmm12, xmm5, xmm12, 3
vpblendw xmm12, xmm12, xmm13, 32
vpsllq xmm13, xmm8, 48
vinsertps xmm12, xmm12, xmm7, 176
vpblendw xmm12, xmm12, xmm13, 128
vpblendd ymm13, ymm12, ymm14, 240
vpbroadcastd ymm14, dword ptr [r9 + 2*r13 - 134360]
vinserti128 ymm11, ymm12, xmm11, 1
vpmovsxbw ymm12, xmmword ptr [rip + .LCPI1_103]
vpermt2w ymm13, ymm24, ymm2
vpblendd ymm13, ymm13, ymm14, 32
vmovdqa xmm14, xmmword ptr [rsp + 384]
vpermt2w ymm13, ymm12, ymm1
vpbroadcastd ymm12, dword ptr [r9 + 2*r13 - 108760]
vpblendd ymm11, ymm13, ymm11, 204
vpmovsxbw ymm13, xmmword ptr [rip + .LCPI1_111]
vpermt2w ymm11, ymm13, ymm0
vmovdqa xmm13, xmmword ptr [rsp + 448]
vpblendd ymm11, ymm11, ymm12, 128
vpsrldq xmm12, xmm3, 10
vpsrldq xmm3, xmm3, 14
vmovdqa ymmword ptr [rsp + 192], ymm11
vpsrldq xmm11, xmm6, 10
vpsrldq xmm6, xmm6, 14
vpunpcklwd xmm11, xmm12, xmm11
vpsrldq xmm12, xmm4, 10
vpunpcklwd xmm3, xmm3, xmm6
vpunpckldq xmm11, xmm11, xmm12
vpbroadcastw xmm12, word ptr [r9 + 2*r13 - 179158]
vpblendw xmm11, xmm11, xmm12, 8
vpsrldq xmm12, xmm5, 10
vpunpcklqdq xmm11, xmm11, xmm12
vpsllq xmm12, xmm7, 16
vpblendw xmm11, xmm11, xmm10, 32
vpsrlq xmm10, xmm10, 16
vpblendd xmm11, xmm11, xmm12, 8
vpbroadcastw xmm12, word ptr [r9 + 2*r13 - 153558]
vpblendw xmm11, xmm11, xmm12, 128
vpbroadcastw xmm12, word ptr [r9 + 2*r13 - 198356]
vmovdqa64 ymm16, ymm11
vpbroadcastw xmm11, word ptr [r9 + 2*r13 - 191956]
vpunpcklwd xmm11, xmm12, xmm11
vpmovsxbw ymm12, xmmword ptr [rip + .LCPI1_112]
vinsertps xmm11, xmm11, xmm4, 220
vpsrldq xmm4, xmm4, 14
vpunpckldq xmm3, xmm3, xmm4
vpbroadcastw xmm4, word ptr [r9 + 2*r13 - 179154]
vpblendw xmm9, xmm11, xmm9, 8
vpslld xmm11, xmm8, 16
vshufps xmm9, xmm9, xmm5, 244
vpblendw xmm9, xmm9, xmm10, 32
vpbroadcastd ymm10, dword ptr [r9 + 2*r13 - 147156]
vpblendd xmm9, xmm9, xmm7, 8
vpblendw xmm9, xmm9, xmm11, 128
vpbroadcastd ymm11, dword ptr [r9 + 2*r13 - 134356]
vpblendw xmm3, xmm3, xmm4, 8
vpsrldq xmm4, xmm5, 14
vpunpcklqdq xmm3, xmm3, xmm4
vpbroadcastw xmm4, word ptr [r9 + 2*r13 - 166354]
vpblendd ymm10, ymm9, ymm10, 240
vinserti128 ymm9, ymm9, xmmword ptr [r9 + 2*r13 - 108768], 1
vpermt2w ymm10, ymm12, ymm2
vpblendd ymm10, ymm10, ymm11, 32
vpmovsxbw ymm11, xmmword ptr [rip + .LCPI1_113]
vpblendw xmm3, xmm3, xmm4, 32
vpsrld xmm4, xmm7, 16
vpblendd xmm3, xmm3, xmm4, 8
vmovdqa xmm4, xmmword ptr [r9 + 2*r13 - 172752]
vpblendw xmm3, xmm3, xmm8, 128
vmovdqa64 ymm25, ymm3
vpbroadcastw ymm3, word ptr [r9 + 2*r13 - 198352]
vpermt2w ymm10, ymm11, ymm1
vpbroadcastd ymm11, dword ptr [r9 + 2*r13 - 121556]
vpunpcklwd xmm3, xmm3, xmm14
vpunpckldq xmm5, xmm3, xmm15
vmovdqa xmm3, xmmword ptr [r9 + 2*r13 - 179152]
vpblendd ymm10, ymm10, ymm11, 192
vpmovsxbw ymm11, xmmword ptr [rip + .LCPI1_114]
insertq xmm5, xmm3, 16, 48
vpunpcklqdq xmm6, xmm5, xmm4
vmovdqa xmm5, xmmword ptr [r9 + 2*r13 - 166352]
vpermt2w ymm10, ymm11, ymm0
vpblendd ymm9, ymm10, ymm9, 136
vpmovsxbw ymm10, xmmword ptr [rip + .LCPI1_115]
vmovdqa ymmword ptr [rsp + 160], ymm9
vpbroadcastd ymm9, dword ptr [r9 + 2*r13 - 134352]
vpbroadcastw xmm7, xmm5
vmovdqa xmm12, xmm5
vmovdqa xmm5, xmmword ptr [r9 + 2*r13 - 153552]
vpblendw xmm6, xmm6, xmm7, 32
vpbroadcastd xmm7, dword ptr [r9 + 2*r13 - 159952]
vpbroadcastw xmm8, xmm5
vpblendd xmm7, xmm6, xmm7, 8
vpblendw xmm7, xmm7, xmm8, 128
vmovdqa ymm8, ymmword ptr [r9 + 2*r13 - 147168]
vpblendd ymm7, ymm7, ymm8, 240
vpermt2w ymm21, ymm19, ymm8
vpermt2w ymm7, ymm10, ymm2
vpmovsxbw ymm10, xmmword ptr [rip + .LCPI1_116]
vpblendd ymm7, ymm7, ymm9, 32
vpbroadcastq ymm9, qword ptr [r9 + 2*r13 - 121552]
vpermt2w ymm7, ymm10, ymm1
vmovdqa ymm10, ymmword ptr [rsp + 416]
vpblendd ymm7, ymm7, ymm9, 192
vpmovsxbw ymm9, xmmword ptr [rip + .LCPI1_117]
vpermt2w ymm7, ymm9, ymm0
vpbroadcastd ymm9, dword ptr [r9 + 2*r13 - 108752]
vpblendd ymm7, ymm7, ymm9, 128
vpsrld xmm9, xmm4, 16
vmovdqa ymmword ptr [rsp + 480], ymm7
vmovdqa ymm7, ymmword ptr [r9 + 2*r13 - 179168]
vpermt2w ymm10, ymm26, ymm7
vpunpcklqdq xmm9, xmm10, xmm9
vpbroadcastw xmm10, word ptr [r9 + 2*r13 - 166350]
vpblendw xmm10, xmm9, xmm10, 32
vmovapd xmm9, xmmword ptr [r9 + 2*r13 - 159952]
vpslldq xmm11, xmm9, 10
vpblendd xmm10, xmm10, xmm11, 8
vpbroadcastw xmm11, word ptr [r9 + 2*r13 - 153550]
vpblendw xmm10, xmm10, xmm11, 128
vpshuflw ymm11, ymm8, 85
vpblendw ymm11, ymm11, ymm2, 2
vpblendd ymm6, ymm10, ymm11, 240
vmovdqa ymm10, ymmword ptr [rsp + 128]
vpmovsxbw ymm11, xmmword ptr [rip + .LCPI1_119]
vmovdqa64 ymm30, ymm6
vmovdqa xmm6, xmm12
vmovdqa64 xmm26, xmm6
vpermt2w ymm10, ymm28, ymm7
vshufps xmm7, xmm10, xmm4, 212
vpslldq xmm10, xmm12, 6
vpsrldq xmm12, xmm14, 10
vpblendw xmm7, xmm7, xmm10, 32
vpbroadcastd ymm10, dword ptr [r9 + 2*r13 - 159948]
vpblendd xmm7, xmm7, xmm10, 8
vpslldq xmm10, xmm5, 10
vpblendw xmm7, xmm7, xmm10, 128
vpbroadcastd ymm10, dword ptr [r9 + 2*r13 - 147148]
vpblendd ymm10, ymm7, ymm10, 240
vpmovsxbw ymm7, xmmword ptr [rip + .LCPI1_120]
vpermt2w ymm10, ymm7, ymm2
vmovdqa ymm7, ymmword ptr [r9 + 2*r13 - 134368]
vpblendd ymm10, ymm10, ymm7, 32
vpermt2w ymm10, ymm11, ymm1
vpbroadcastd ymm11, dword ptr [r9 + 2*r13 - 121548]
vpblendd ymm10, ymm10, ymm11, 192
vpmovsxbw ymm11, xmmword ptr [rip + .LCPI1_121]
vpermt2w ymm10, ymm11, ymm0
vpbroadcastd ymm11, dword ptr [r9 + 2*r13 - 108748]
vpblendd ymm10, ymm10, ymm11, 128
vpsrlq xmm11, xmm14, 48
vmovdqa ymmword ptr [rsp + 128], ymm10
vpsrlq xmm10, xmm13, 48
vpunpcklwd xmm10, xmm10, xmm11
vpsrlq xmm11, xmm15, 48
vpunpckldq xmm10, xmm10, xmm11
vpsrlq xmm11, xmm4, 48
vpblendw xmm10, xmm10, xmm3, 8
vpunpcklqdq xmm10, xmm10, xmm11
vpbroadcastw xmm11, word ptr [r9 + 2*r13 - 166346]
vpblendw xmm10, xmm10, xmm11, 32
vpmovzxwd xmm11, xmm9
vpblendd xmm10, xmm10, xmm11, 8
vpbroadcastw xmm11, word ptr [r9 + 2*r13 - 153546]
vpblendw xmm10, xmm10, xmm11, 128
vpsrldq xmm11, xmm13, 10
vpsrldq xmm13, xmm13, 14
vpunpcklwd xmm11, xmm11, xmm12
vpsrldq xmm12, xmm15, 10
vpunpckldq xmm11, xmm11, xmm12
vpbroadcastw xmm12, word ptr [r9 + 2*r13 - 179142]
vpblendw xmm11, xmm11, xmm12, 8
vpsrldq xmm12, xmm4, 10
vpunpcklqdq xmm11, xmm11, xmm12
vpsllq xmm12, xmm9, 16
vpsrld xmm9, xmm9, 16
vpblendw xmm11, xmm11, xmm6, 32
vpblendd xmm11, xmm11, xmm12, 8
vpbroadcastw xmm12, word ptr [r9 + 2*r13 - 153542]
vpblendw xmm11, xmm11, xmm12, 128
vpsrldq xmm12, xmm14, 14
vpunpcklwd xmm12, xmm13, xmm12
vpsrldq xmm13, xmm15, 14
vpmovsxbw xmm15, qword ptr [rip + .LCPI1_102]
vpunpckldq xmm12, xmm12, xmm13
vpbroadcastw xmm13, word ptr [r9 + 2*r13 - 179138]
vpblendw xmm12, xmm12, xmm13, 8
vpsrldq xmm13, xmm4, 14
vpunpcklqdq xmm12, xmm12, xmm13
vpbroadcastw xmm13, word ptr [r9 + 2*r13 - 166338]
vpblendw xmm12, xmm12, xmm13, 32
vpblendd xmm9, xmm12, xmm9, 8
vpmovsxbw ymm12, xmmword ptr [rip + .LCPI1_123]
vpblendw xmm9, xmm9, xmm5, 128
vpermt2w ymm31, ymm12, ymm8
vpmovsxbw ymm12, xmmword ptr [rip + .LCPI1_124]
vpermt2w ymm16, ymm12, ymm8
vpmovsxbw ymm12, xmmword ptr [rip + .LCPI1_125]
vpermt2w ymm25, ymm12, ymm8
vpmovsxbw ymm12, xmmword ptr [rip + .LCPI1_126]
vpermt2w ymm11, ymm12, ymm8
vpmovsxbw ymm12, xmmword ptr [rip + .LCPI1_127]
vpermt2w ymm9, ymm12, ymm8
vpbroadcastd ymm12, dword ptr [rip + .LCPI1_66]
vpermt2w ymm8, ymm12, ymm2
vpmovsxbw ymm12, xmmword ptr [rip + .LCPI1_128]
vpermt2w ymm8, ymm12, ymm7
vpmovsxbw ymm12, xmmword ptr [rip + .LCPI1_129]
vpblendw ymm8, ymm8, ymm1, 8
vpblendd ymm13, ymm10, ymm8, 240
vpbroadcastw ymm8, word ptr [r9 + 2*r13 - 198344]
vpbroadcastw xmm10, word ptr [r9 + 2*r13 - 191944]
vpunpcklwd xmm8, xmm8, xmm10
vunpcklps xmm8, xmm8, dword ptr [r9 + 2*r13 - 185544]{1to4}
vpsrldq xmm10, xmm3, 2
vpsrldq xmm3, xmm3, 6
vpblendw xmm8, xmm8, xmm10, 8
vpslld xmm10, xmm6, 16
vpblendd xmm8, xmm4, xmm8, 3
vpblendw xmm8, xmm8, xmm10, 32
vpbroadcastd xmm10, dword ptr [r9 + 2*r13 - 159944]
vpblendd xmm8, xmm8, xmm10, 8
vpsllq xmm10, xmm5, 48
vpblendw xmm8, xmm8, xmm10, 128
vpbroadcastq ymm10, qword ptr [r9 + 2*r13 - 147144]
vpblendd ymm8, ymm8, ymm10, 240
vpmovsxbw ymm10, xmmword ptr [rip + .LCPI1_130]
vpermt2w ymm8, ymm10, ymm2
vpbroadcastd ymm10, dword ptr [r9 + 2*r13 - 134344]
vpblendd ymm10, ymm8, ymm10, 32
vpmovsxbw ymm8, xmmword ptr [rip + .LCPI1_131]
vpermt2w ymm10, ymm8, ymm1
vmovdqa ymm8, ymmword ptr [r9 + 2*r13 - 121568]
vpblendd ymm10, ymm10, ymm8, 192
vpermt2w ymm10, ymm12, ymm0
vpbroadcastd ymm12, dword ptr [r9 + 2*r13 - 108744]
vpblendd ymm14, ymm10, ymm12, 128
vpmovsxbw ymm12, xmmword ptr [rip + .LCPI1_132]
vmovdqa ymm10, ymm11
vpermt2w ymm10, ymm12, ymm2
vpmovsxbw ymm12, xmmword ptr [rip + .LCPI1_133]
vpermt2w ymm10, ymm12, ymm7
vpmovsxbw ymm12, xmmword ptr [rip + .LCPI1_134]
vpermt2w ymm10, ymm12, ymm1
vpmovsxbw ymm12, xmmword ptr [rip + .LCPI1_135]
vpermt2w ymm10, ymm12, ymm8
vpbroadcastw xmm12, word ptr [r9 + 2*r13 - 191940]
vpblendw ymm10, ymm10, ymm0, 32
vpblendd ymm6, ymm11, ymm10, 240
vpbroadcastw ymm10, word ptr [r9 + 2*r13 - 198340]
vpbroadcastd ymm11, dword ptr [r9 + 2*r13 - 185540]
vpunpcklwd xmm10, xmm10, xmm12
vpmovsxbw ymm12, xmmword ptr [rip + .LCPI1_109]
vpblendd xmm10, xmm10, xmm11, 2
vmovdqa ymm11, ymmword ptr [r9 + 2*r13 - 38368]
vpblendw xmm3, xmm10, xmm3, 8
vmovaps xmm10, xmmword ptr [r9 + 2*r13 - 57568]
vshufps xmm3, xmm3, xmm4, 244
vpsrlq xmm4, xmm26, 16
vpblendw xmm3, xmm3, xmm4, 32
vpbroadcastd ymm4, dword ptr [r9 + 2*r13 - 159940]
vpblendd xmm3, xmm3, xmm4, 8
vpslld xmm4, xmm5, 16
vmovdqa64 ymm5, ymm21
vpermt2w ymm5, ymm27, ymm2
vpblendw xmm3, xmm3, xmm4, 128
vpbroadcastd ymm4, dword ptr [r9 + 2*r13 - 147140]
vpermt2w ymm5, ymm20, ymm7
vmovdqa64 ymm20, ymm30
vpmovsxbw ymm30, xmmword ptr [rip + .LCPI1_101]
vpblendd ymm3, ymm3, ymm4, 240
vpmovsxbw ymm4, xmmword ptr [rip + .LCPI1_136]
vpermt2w ymm5, ymm30, ymm1
vmovdqa64 xmm30, xmmword ptr [r9 + 2*r13 - 51152]
vpermt2w ymm5, ymm23, ymm8
vmovdqa64 ymm23, ymm13
vmovdqa ymm13, ymmword ptr [r9 + 2*r13 - 25568]
vpermt2w ymm3, ymm4, ymm2
vpbroadcastd ymm4, dword ptr [r9 + 2*r13 - 134340]
vpblendd ymm3, ymm3, ymm4, 32
vpmovsxbw ymm4, xmmword ptr [rip + .LCPI1_137]
vpermt2w ymm3, ymm4, ymm1
vpbroadcastd ymm4, dword ptr [r9 + 2*r13 - 121540]
vpblendd ymm3, ymm3, ymm4, 192
vpmovsxbw ymm4, xmmword ptr [rip + .LCPI1_138]
vpermt2w ymm31, ymm4, ymm2
vpmovsxbw ymm4, xmmword ptr [rip + .LCPI1_139]
vpermt2w ymm16, ymm4, ymm2
vpmovsxbw ymm4, xmmword ptr [rip + .LCPI1_140]
vpermt2w ymm25, ymm4, ymm2
vpmovsxbw ymm4, xmmword ptr [rip + .LCPI1_141]
vpermt2w ymm9, ymm4, ymm2
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_142]
vpmovsxbw ymm4, xmmword ptr [rip + .LCPI1_148]
vpermt2w ymm31, ymm2, ymm7
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_143]
vpermt2w ymm16, ymm2, ymm7
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_144]
vpermt2w ymm25, ymm2, ymm7
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_145]
vpermt2w ymm20, ymm2, ymm7
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_146]
vpermt2w ymm9, ymm2, ymm7
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_147]
vpermt2w ymm31, ymm2, ymm1
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_149]
vpermt2w ymm16, ymm2, ymm1
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_150]
vpermt2w ymm25, ymm2, ymm1
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_151]
vpermt2w ymm20, ymm2, ymm1
vmovdqa ymm2, ymm9
vpermt2w ymm2, ymm4, ymm1
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_152]
vpermt2w ymm31, ymm1, ymm8
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_153]
vpermt2w ymm16, ymm1, ymm8
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_154]
vpermt2w ymm25, ymm1, ymm8
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_155]
vpermt2w ymm20, ymm1, ymm8
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_156]
vpermt2w ymm23, ymm1, ymm8
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_157]
vpermt2w ymm2, ymm1, ymm8
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_158]
vpbroadcastd ymm8, dword ptr [r9 + 2*r13 - 31968]
vpermt2w ymm5, ymm1, ymm0
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_160]
vmovdqa64 ymm27, ymm5
vpermt2w ymm31, ymm1, ymm0
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_161]
vpermt2w ymm16, ymm1, ymm0
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_162]
vmovdqa64 ymm21, ymm16
vmovdqa64 xmm16, xmmword ptr [r9 + 2*r13 - 19168]
vpermt2w ymm25, ymm1, ymm0
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_163]
vmovdqa64 ymm19, ymm25
vpmovsxbw ymm25, xmmword ptr [rip + .LCPI1_115]
vpermt2w ymm20, ymm1, ymm0
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_164]
vpermt2w ymm23, ymm1, ymm0
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_165]
vpermt2w ymm3, ymm1, ymm0
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_166]
vpermt2w ymm2, ymm1, ymm0
vmovdqa ymm0, ymmword ptr [r9 + 2*r13 - 108768]
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_159]
vpermt2w ymm27, ymm1, ymm0
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_167]
vpblendd ymm4, ymm3, ymm0, 128
vmovdqa ymm3, ymmword ptr [rsp + 224]
vpermt2w ymm31, ymm1, ymm0
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_169]
vpermt2w ymm21, ymm1, ymm0
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_170]
vpermt2w ymm19, ymm1, ymm0
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_171]
vpermt2w ymm20, ymm1, ymm0
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_172]
vpermt2w ymm23, ymm1, ymm0
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_173]
vpermt2w ymm6, ymm1, ymm0
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_168]
vpermt2w ymm2, ymm1, ymm0
vmovdqa ymm0, ymmword ptr [r9 + 2*r13 - 102368]
vmovdqa ymm1, ymmword ptr [rsp + 256]
vpermt2w ymm1, ymm22, ymm0
vmovdqa64 xmm22, xmmword ptr [rsp + 752]
vmovdqa ymmword ptr [rsp + 256], ymm1
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_174]
vpermt2w ymm27, ymm1, ymm0
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_175]
vpermt2w ymm3, ymm1, ymm0
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_176]
vmovdqa ymmword ptr [rsp + 224], ymm3
vmovdqa ymm3, ymmword ptr [rsp + 192]
vpermt2w ymm31, ymm1, ymm0
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_177]
vpermt2w ymm3, ymm1, ymm0
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_178]
vmovdqa ymmword ptr [rsp + 192], ymm3
vmovdqa ymm3, ymmword ptr [rsp + 160]
vpermt2w ymm21, ymm1, ymm0
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_179]
vpermt2w ymm3, ymm1, ymm0
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_180]
vmovdqa ymmword ptr [rsp + 160], ymm3
vmovdqa ymm3, ymmword ptr [rsp + 480]
vpermt2w ymm19, ymm1, ymm0
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_181]
vpermt2w ymm3, ymm1, ymm0
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_182]
vmovdqa ymmword ptr [rsp + 480], ymm3
vmovdqa ymm3, ymmword ptr [rsp + 128]
vpermt2w ymm20, ymm1, ymm0
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_183]
vpermt2w ymm3, ymm1, ymm0
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_184]
vmovdqa ymmword ptr [rsp + 128], ymm3
vpermt2w ymm23, ymm1, ymm0
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_185]
vpermt2w ymm14, ymm1, ymm0
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_186]
vmovdqa ymmword ptr [rsp + 448], ymm14
vmovdqa ymm14, ymmword ptr [r9 + 2*r13 - 12768]
vpermt2w ymm6, ymm1, ymm0
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_187]
vmovdqa ymmword ptr [rsp + 416], ymm6
vpermt2w ymm4, ymm1, ymm0
vmovdqa xmm1, xmmword ptr [r9 + 2*r13 - 83168]
vmovdqa ymmword ptr [rsp + 384], ymm4
vpblendw ymm4, ymm2, ymm0, 128
vmovdqa xmm0, xmmword ptr [r9 + 2*r13 - 95968]
vmovdqa xmm2, xmmword ptr [r9 + 2*r13 - 89568]
vpblendd ymm4, ymm9, ymm4, 240
vmovdqa xmm9, xmmword ptr [r9 + 2*r13 - 63968]
vmovdqa ymmword ptr [rsp + 352], ymm4
vmovdqa xmm4, xmmword ptr [r9 + 2*r13 - 70368]
vpunpcklwd xmm3, xmm0, xmm2
vpunpckldq xmm5, xmm3, xmm1
vmovdqa xmm3, xmmword ptr [r9 + 2*r13 - 76768]
vpbroadcastw xmm6, xmm9
insertq xmm5, xmm3, 16, 48
vpunpcklqdq xmm5, xmm5, xmm4
vpblendw xmm5, xmm5, xmm6, 32
vmovdqa xmm6, xmmword ptr [r9 + 2*r13 - 51168]
vinsertps xmm5, xmm5, xmm10, 48
vpbroadcastw xmm7, xmm6
vpblendw xmm7, xmm5, xmm7, 128
vinserti128 ymm5, ymm7, xmmword ptr [r9 + 2*r13 - 44768], 1
vinserti32x4 ymm7, ymm7, xmm16, 1
vpermt2w ymm5, ymm29, ymm11
vpblendd ymm8, ymm5, ymm8, 32
vpmovsxbw ymm5, xmmword ptr [rip + .LCPI1_95]
vpermt2w ymm8, ymm5, ymm13
vpmovsxbw ymm5, xmmword ptr [rip + .LCPI1_96]
vshufpd ymm7, ymm8, ymm7, 2
vpbroadcastd ymm8, dword ptr [r9 + 2*r13 - 6368]
vpermt2w ymm7, ymm5, ymm14
vpblendd ymm5, ymm7, ymm8, 128
vpsrld xmm7, xmm0, 16
vpsrld xmm8, xmm1, 16
vpblendw xmm7, xmm7, xmm2, 2
vmovdqa ymmword ptr [rsp + 960], ymm5
vpunpckldq xmm7, xmm7, xmm8
vpsrld xmm8, xmm4, 16
vpermt2w xmm7, xmm15, xmm3
vpbroadcastd ymm15, dword ptr [r9 + 2*r13 - 19164]
vpunpcklqdq xmm7, xmm7, xmm8
vpbroadcastw xmm8, word ptr [r9 + 2*r13 - 63966]
vpblendw xmm7, xmm7, xmm8, 32
vpslldq xmm8, xmm10, 10
vpblendd xmm7, xmm7, xmm8, 8
vpbroadcastw xmm8, word ptr [r9 + 2*r13 - 51166]
vpblendw xmm5, xmm7, xmm8, 128
vpbroadcastw xmm7, word ptr [r9 + 2*r13 - 89564]
vpbroadcastw xmm8, word ptr [r9 + 2*r13 - 95964]
vmovdqa64 ymm28, ymm5
vpmovsxbw ymm5, xmmword ptr [rip + .LCPI1_108]
vpunpcklwd xmm7, xmm8, xmm7
vpslldq xmm8, xmm9, 6
vpblendd xmm7, xmm7, xmm1, 2
vpermt2w xmm7, xmm17, xmm3
vpmovsxbw ymm17, xmmword ptr [rip + .LCPI1_123]
vshufps xmm7, xmm7, xmm4, 212
vpblendw xmm7, xmm7, xmm8, 32
vpslldq xmm8, xmm6, 10
vinsertps xmm7, xmm7, xmm10, 112
vpblendw xmm7, xmm7, xmm8, 128
vpbroadcastd ymm8, dword ptr [r9 + 2*r13 - 44764]
vpblendd ymm8, ymm7, ymm8, 240
vinserti128 ymm7, ymm7, xmmword ptr [r9 + 2*r13 - 31968], 1
vpermt2w ymm8, ymm18, ymm11
vpmovsxbw ymm18, xmmword ptr [rip + .LCPI1_103]
vpblendd ymm7, ymm8, ymm7, 34
vpbroadcastd ymm8, dword ptr [r9 + 2*r13 - 6364]
vpermt2w ymm7, ymm5, ymm13
vpblendd ymm7, ymm7, ymm15, 192
vpbroadcastw xmm15, word ptr [r9 + 2*r13 - 95960]
vpermt2w ymm7, ymm12, ymm14
vpblendd ymm5, ymm7, ymm8, 128
vpsrlq xmm7, xmm2, 48
vpsrlq xmm8, xmm0, 48
vpunpcklwd xmm7, xmm8, xmm7
vpsrlq xmm8, xmm1, 48
vmovdqa ymmword ptr [rsp + 928], ymm5
vpbroadcastq ymm5, qword ptr [r9 + 2*r13 - 44760]
vpunpckldq xmm7, xmm7, xmm8
vpsrlq xmm8, xmm4, 48
vpblendw xmm7, xmm7, xmm3, 8
vpunpcklqdq xmm7, xmm7, xmm8
vpbroadcastw xmm8, word ptr [r9 + 2*r13 - 63962]
vpblendw xmm7, xmm7, xmm8, 32
vpmovzxwd xmm8, xmm10
vpblendd xmm7, xmm7, xmm8, 8
vpbroadcastw xmm8, word ptr [r9 + 2*r13 - 51162]
vpblendw xmm12, xmm7, xmm8, 128
vpbroadcastw xmm7, word ptr [r9 + 2*r13 - 89560]
vpmovsxbw ymm8, xmmword ptr [rip + .LCPI1_111]
vpunpcklwd xmm7, xmm15, xmm7
vpsrldq xmm15, xmm3, 2
vpsrldq xmm3, xmm3, 6
vinsertps xmm7, xmm7, xmm1, 156
vpblendw xmm7, xmm7, xmm15, 8
vpslld xmm15, xmm9, 16
vpblendd xmm7, xmm4, xmm7, 3
vpblendw xmm7, xmm7, xmm15, 32
vpsllq xmm15, xmm6, 48
vinsertps xmm7, xmm7, xmm10, 176
vpblendw xmm7, xmm7, xmm15, 128
vpbroadcastd ymm15, dword ptr [r9 + 2*r13 - 31960]
vpblendd ymm5, ymm7, ymm5, 240
vinserti32x4 ymm7, ymm7, xmm16, 1
vpmovsxbw ymm16, xmmword ptr [rip + .LCPI1_112]
vpermt2w ymm5, ymm24, ymm11
vmovdqa64 xmm24, xmmword ptr [r9 + 2*r13 - 63952]
vpblendd ymm5, ymm5, ymm15, 32
vpbroadcastd ymm15, dword ptr [r9 + 2*r13 - 6360]
vpermt2w ymm5, ymm18, ymm13
vpmovsxbw ymm18, xmmword ptr [rip + .LCPI1_113]
vpblendd ymm5, ymm5, ymm7, 204
vpsrldq xmm7, xmm0, 10
vpsrldq xmm0, xmm0, 14
vpermt2w ymm5, ymm8, ymm14
vpblendd ymm5, ymm5, ymm15, 128
vpbroadcastw xmm15, word ptr [r9 + 2*r13 - 95956]
vmovdqa ymmword ptr [rsp + 896], ymm5
vpsrldq xmm5, xmm2, 10
vpsrldq xmm2, xmm2, 14
vpunpcklwd xmm5, xmm7, xmm5
vpsrldq xmm7, xmm1, 10
vpunpcklwd xmm0, xmm0, xmm2
vmovdqa ymm2, ymmword ptr [rsp + 1696]
vpunpckldq xmm5, xmm5, xmm7
vpbroadcastw xmm7, word ptr [r9 + 2*r13 - 76758]
vpblendw xmm5, xmm5, xmm7, 8
vpsrldq xmm7, xmm4, 10
vpunpcklqdq xmm5, xmm5, xmm7
vpsllq xmm7, xmm10, 16
vpblendw xmm5, xmm5, xmm9, 32
vpblendd xmm5, xmm5, xmm7, 8
vpbroadcastw xmm7, word ptr [r9 + 2*r13 - 51158]
vpblendw xmm8, xmm5, xmm7, 128
vpbroadcastw xmm5, word ptr [r9 + 2*r13 - 89556]
vpmovsxbw ymm7, xmmword ptr [rip + .LCPI1_114]
vpunpcklwd xmm5, xmm15, xmm5
vmovdqa xmm15, xmmword ptr [rsp + 768]
vinsertps xmm5, xmm5, xmm1, 220
vpsrldq xmm1, xmm1, 14
vpblendw xmm3, xmm5, xmm3, 8
vpsrlq xmm5, xmm9, 16
vpslld xmm9, xmm6, 16
vpunpckldq xmm0, xmm0, xmm1
vpbroadcastw xmm1, word ptr [r9 + 2*r13 - 76754]
vshufps xmm3, xmm3, xmm4, 244
vpblendw xmm3, xmm3, xmm5, 32
vpbroadcastd ymm5, dword ptr [r9 + 2*r13 - 44756]
vpblendd xmm3, xmm3, xmm10, 8
vpblendw xmm3, xmm3, xmm9, 128
vpbroadcastd ymm9, dword ptr [r9 + 2*r13 - 31956]
vpblendw xmm0, xmm0, xmm1, 8
vpsrldq xmm1, xmm4, 14
vmovdqa ymm4, ymmword ptr [r9 + 2*r13 - 31968]
vpunpcklqdq xmm0, xmm0, xmm1
vpbroadcastw xmm1, word ptr [r9 + 2*r13 - 63954]
vpblendd ymm5, ymm3, ymm5, 240
vinserti128 ymm3, ymm3, xmmword ptr [r9 + 2*r13 - 6368], 1
vpermt2w ymm5, ymm16, ymm11
vpmovsxbw ymm16, xmmword ptr [rip + .LCPI1_116]
vpblendd ymm5, ymm5, ymm9, 32
vpbroadcastd ymm9, dword ptr [r9 + 2*r13 - 19156]
vpermt2w ymm5, ymm18, ymm13
vpblendw xmm0, xmm0, xmm1, 32
vpsrld xmm1, xmm10, 16
vmovdqa ymm10, ymmword ptr [r9 + 2*r13 - 44768]
vpmovsxbw ymm18, xmmword ptr [rip + .LCPI1_117]
vpblendd xmm0, xmm0, xmm1, 8
vmovdqa xmm1, xmmword ptr [r9 + 2*r13 - 76752]
vpblendd ymm5, ymm5, ymm9, 192
vmovdqa xmm9, xmmword ptr [r9 + 2*r13 - 70352]
vpermt2w ymm12, ymm17, ymm10
vpmovsxbw ymm17, xmmword ptr [rip + .LCPI1_140]
vpermt2w ymm5, ymm7, ymm14
vmovdqa64 xmm29, xmm1
vpblendd ymm3, ymm5, ymm3, 136
vpblendw xmm5, xmm0, xmm6, 128
vpbroadcastw ymm0, word ptr [r9 + 2*r13 - 95952]
vpmovsxbw ymm6, xmmword ptr [rip + .LCPI1_119]
vmovdqa ymmword ptr [rsp + 832], ymm3
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_120]
vpunpcklwd xmm0, xmm0, xmm15
vpunpckldq xmm0, xmm0, xmm22
insertq xmm0, xmm1, 16, 48
vpbroadcastw xmm1, xmm24
vpunpcklqdq xmm0, xmm0, xmm9
vpblendw xmm0, xmm0, xmm1, 32
vpbroadcastd xmm1, dword ptr [r9 + 2*r13 - 57552]
vpblendd xmm0, xmm0, xmm1, 8
vpbroadcastw xmm1, xmm30
vpblendw xmm0, xmm0, xmm1, 128
vpbroadcastd ymm1, dword ptr [r9 + 2*r13 - 31952]
vpblendd ymm0, ymm0, ymm10, 240
vpermt2w ymm0, ymm25, ymm11
vpmovsxbw ymm25, xmmword ptr [rip + .LCPI1_121]
vpblendd ymm0, ymm0, ymm1, 32
vpbroadcastq ymm1, qword ptr [r9 + 2*r13 - 19152]
vpermt2w ymm0, ymm16, ymm13
vmovapd xmm16, xmmword ptr [r9 + 2*r13 - 57552]
vpblendd ymm0, ymm0, ymm1, 192
vpbroadcastd ymm1, dword ptr [r9 + 2*r13 - 6352]
vpermt2w ymm0, ymm18, ymm14
vpmovsxbw ymm18, xmmword ptr [rip + .LCPI1_128]
vpblendd ymm0, ymm0, ymm1, 128
vpmovsxbw xmm1, qword ptr [rip + .LCPI1_110]
vmovdqa ymmword ptr [rsp + 800], ymm0
vmovdqa ymm0, ymmword ptr [r9 + 2*r13 - 76768]
vpermt2w ymm2, ymm1, ymm0
vpsrld xmm1, xmm9, 16
vpunpcklqdq xmm1, xmm2, xmm1
vpbroadcastw xmm2, word ptr [r9 + 2*r13 - 63950]
vpblendw xmm1, xmm1, xmm2, 32
vpslldq xmm2, xmm16, 10
vpblendd xmm1, xmm1, xmm2, 8
vpbroadcastw xmm2, word ptr [r9 + 2*r13 - 51150]
vpblendw xmm1, xmm1, xmm2, 128
vpshuflw ymm2, ymm10, 85
vpblendw ymm2, ymm2, ymm11, 2
vpblendd ymm7, ymm1, ymm2, 240
vpmovsxbw xmm2, qword ptr [rip + .LCPI1_100]
vmovdqa ymm1, ymmword ptr [rsp + 1728]
vpermt2w ymm1, ymm2, ymm0
vpsrldq xmm2, xmm15, 10
vshufps xmm0, xmm1, xmm9, 212
vpslldq xmm1, xmm24, 6
vpblendw xmm0, xmm0, xmm1, 32
vpbroadcastd ymm1, dword ptr [r9 + 2*r13 - 57548]
vpblendd xmm0, xmm0, xmm1, 8
vpslldq xmm1, xmm30, 10
vpblendw xmm0, xmm0, xmm1, 128
vpbroadcastd ymm1, dword ptr [r9 + 2*r13 - 44748]
vpblendd ymm0, ymm0, ymm1, 240
vpbroadcastd ymm1, dword ptr [r9 + 2*r13 - 19148]
vpermt2w ymm0, ymm3, ymm11
vmovdqa xmm3, xmmword ptr [rsp + 784]
vpblendd ymm0, ymm0, ymm4, 32
vpermt2w ymm0, ymm6, ymm13
vmovdqa64 xmm6, xmm22
vpblendd ymm0, ymm0, ymm1, 192
vpbroadcastd ymm1, dword ptr [r9 + 2*r13 - 6348]
vpermt2w ymm0, ymm25, ymm14
vpblendd ymm0, ymm0, ymm1, 128
vpsrlq xmm1, xmm15, 48
vmovdqa64 ymm25, ymm0
vpsrlq xmm0, xmm3, 48
vpunpcklwd xmm0, xmm0, xmm1
vpsrlq xmm1, xmm22, 48
vmovapd xmm22, xmm15
vmovdqa xmm15, xmm6
vpunpckldq xmm0, xmm0, xmm1
vmovdqa64 xmm1, xmm29
vpblendw xmm0, xmm0, xmm1, 8
vpsrlq xmm1, xmm9, 48
vpunpcklqdq xmm0, xmm0, xmm1
vpbroadcastw xmm1, word ptr [r9 + 2*r13 - 63946]
vpblendw xmm0, xmm0, xmm1, 32
vpmovzxwd xmm1, xmm16
vpblendd xmm0, xmm0, xmm1, 8
vpbroadcastw xmm1, word ptr [r9 + 2*r13 - 51146]
vpblendw xmm0, xmm0, xmm1, 128
vpsrldq xmm1, xmm3, 10
vpunpcklwd xmm1, xmm1, xmm2
vpsrldq xmm2, xmm6, 10
vpunpckldq xmm1, xmm1, xmm2
vpbroadcastw xmm2, word ptr [r9 + 2*r13 - 76742]
vpblendw xmm1, xmm1, xmm2, 8
vpsrldq xmm2, xmm9, 10
vpunpcklqdq xmm1, xmm1, xmm2
vmovdqa64 xmm2, xmm24
vpblendw xmm1, xmm1, xmm2, 32
vpsllq xmm2, xmm16, 16
vpblendd xmm1, xmm1, xmm2, 8
vpbroadcastw xmm2, word ptr [r9 + 2*r13 - 51142]
vpblendw xmm6, xmm1, xmm2, 128
vpsrldq xmm1, xmm22, 14
vpsrldq xmm2, xmm3, 14
vpmovsxbw ymm3, xmmword ptr [rip + .LCPI1_125]
vmovdqa64 xmm22, xmm29
vpunpcklwd xmm1, xmm2, xmm1
vpsrldq xmm2, xmm15, 14
vmovdqa64 xmm15, xmm30
vpmovsxbw ymm30, xmmword ptr [rip + .LCPI1_131]
vpunpckldq xmm1, xmm1, xmm2
vpbroadcastw xmm2, word ptr [r9 + 2*r13 - 76738]
vpermt2w ymm5, ymm3, ymm10
vpblendw xmm1, xmm1, xmm2, 8
vpsrldq xmm2, xmm9, 14
vpunpcklqdq xmm1, xmm1, xmm2
vpbroadcastw xmm2, word ptr [r9 + 2*r13 - 63938]
vpermt2w ymm5, ymm17, ymm11
vpmovsxbw ymm17, xmmword ptr [rip + .LCPI1_145]
vpblendw xmm1, xmm1, xmm2, 32
vpsrld xmm2, xmm16, 16
vpmovsxbw ymm16, xmmword ptr [rip + .LCPI1_129]
vpblendd xmm1, xmm1, xmm2, 8
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_97]
vpermt2w ymm7, ymm17, ymm4
vpmovsxbw ymm17, xmmword ptr [rip + .LCPI1_151]
vpblendw xmm3, xmm1, xmm15, 128
vpbroadcastd ymm1, dword ptr [rip + .LCPI1_66]
vmovdqa64 ymm26, ymm2
vpermt2w ymm28, ymm2, ymm10
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_124]
vpermt2w ymm7, ymm17, ymm13
vpmovsxbw ymm17, xmmword ptr [rip + .LCPI1_148]
vpermt2w ymm8, ymm2, ymm10
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_126]
vpermt2w ymm6, ymm2, ymm10
vpmovsxbw ymm2, xmmword ptr [rip + .LCPI1_127]
vpermt2w ymm3, ymm2, ymm10
vpermt2w ymm10, ymm1, ymm11
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_130]
vpermt2w ymm10, ymm18, ymm4
vpmovsxbw ymm18, xmmword ptr [rip + .LCPI1_132]
vpblendw ymm2, ymm10, ymm13, 8
vpbroadcastw xmm10, word ptr [r9 + 2*r13 - 89544]
vpblendd ymm2, ymm0, ymm2, 240
vpbroadcastw ymm0, word ptr [r9 + 2*r13 - 95944]
vpunpcklwd xmm0, xmm0, xmm10
vunpcklps xmm0, xmm0, dword ptr [r9 + 2*r13 - 83144]{1to4}
vpsrldq xmm10, xmm29, 2
vmovdqa64 xmm29, xmm15
vpblendw xmm0, xmm0, xmm10, 8
vpslld xmm10, xmm24, 16
vpblendd xmm0, xmm9, xmm0, 3
vpblendw xmm0, xmm0, xmm10, 32
vpbroadcastd xmm10, dword ptr [r9 + 2*r13 - 57544]
vpblendd xmm0, xmm0, xmm10, 8
vpsllq xmm10, xmm15, 48
vpbroadcastd ymm15, dword ptr [r9 + 2*r13 - 6344]
vpblendw xmm0, xmm0, xmm10, 128
vpbroadcastq ymm10, qword ptr [r9 + 2*r13 - 44744]
vpblendd ymm0, ymm0, ymm10, 240
vpbroadcastd ymm10, dword ptr [r9 + 2*r13 - 31944]
vpermt2w ymm0, ymm1, ymm11
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_133]
vpblendd ymm10, ymm0, ymm10, 32
vmovdqa ymm0, ymmword ptr [r9 + 2*r13 - 19168]
vpermt2w ymm10, ymm30, ymm13
vpblendd ymm10, ymm10, ymm0, 192
vpermt2w ymm10, ymm16, ymm14
vpmovsxbw ymm16, xmmword ptr [rip + .LCPI1_134]
vpblendd ymm10, ymm10, ymm15, 128
vmovdqa ymm15, ymm6
vpermt2w ymm15, ymm18, ymm11
vpmovsxbw ymm18, xmmword ptr [rip + .LCPI1_135]
vpermt2w ymm15, ymm1, ymm4
vpbroadcastd ymm1, dword ptr [r9 + 2*r13 - 83140]
vpermt2w ymm15, ymm16, ymm13
vpbroadcastw xmm16, word ptr [r9 + 2*r13 - 89540]
vpermt2w ymm15, ymm18, ymm0
vpmovsxbw ymm18, xmmword ptr [rip + .LCPI1_136]
vpblendw ymm15, ymm15, ymm14, 32
vpblendd ymm6, ymm6, ymm15, 240
vpbroadcastw ymm15, word ptr [r9 + 2*r13 - 95940]
vpunpcklwd xmm15, xmm15, xmm16
vpmovsxbw ymm16, xmmword ptr [rip + .LCPI1_139]
vpblendd xmm1, xmm15, xmm1, 2
vpsrldq xmm15, xmm22, 6
vpblendw xmm1, xmm1, xmm15, 8
vpmovsxbw ymm15, xmmword ptr [rip + .LCPI1_138]
vshufps xmm1, xmm1, xmm9, 244
vpsrlq xmm9, xmm24, 16
vpblendw xmm1, xmm1, xmm9, 32
vpbroadcastd ymm9, dword ptr [r9 + 2*r13 - 57540]
vpermt2w ymm8, ymm16, ymm11
vpmovsxbw ymm16, xmmword ptr [rip + .LCPI1_143]
vpermt2w ymm12, ymm15, ymm11
vpmovsxbw ymm15, xmmword ptr [rip + .LCPI1_147]
vpblendd xmm1, xmm1, xmm9, 8
vpslld xmm9, xmm29, 16
vpblendw xmm1, xmm1, xmm9, 128
vpbroadcastd ymm9, dword ptr [r9 + 2*r13 - 44740]
vpermt2w ymm8, ymm16, ymm4
vpmovsxbw ymm16, xmmword ptr [rip + .LCPI1_150]
vpblendd ymm1, ymm1, ymm9, 240
vpbroadcastd ymm9, dword ptr [r9 + 2*r13 - 31940]
vpermt2w ymm1, ymm18, ymm11
vpmovsxbw ymm18, xmmword ptr [rip + .LCPI1_137]
vpblendd ymm1, ymm1, ymm9, 32
vpbroadcastd ymm9, dword ptr [r9 + 2*r13 - 19140]
vpermt2w ymm1, ymm18, ymm13
vpmovsxbw ymm18, xmmword ptr [rip + .LCPI1_141]
vpblendd ymm9, ymm1, ymm9, 192
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_98]
vpermt2w ymm3, ymm18, ymm11
vpmovsxbw ymm18, xmmword ptr [rip + .LCPI1_104]
vpermt2w ymm28, ymm1, ymm11
vmovdqa64 ymm22, ymm1
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_142]
vpmovsxbw ymm11, xmmword ptr [rip + .LCPI1_146]
vpermt2w ymm28, ymm18, ymm4
vmovdqa64 ymm18, ymmword ptr [rsp + 800]
vpermt2w ymm12, ymm1, ymm4
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_144]
vpermt2w ymm3, ymm11, ymm4
vpmovsxbw ymm11, xmmword ptr [rip + .LCPI1_101]
vpermt2w ymm12, ymm15, ymm13
vpmovsxbw ymm15, xmmword ptr [rip + .LCPI1_153]
vpermt2w ymm5, ymm1, ymm4
vpermt2w ymm28, ymm11, ymm13
vpmovsxbw ymm11, xmmword ptr [rip + .LCPI1_149]
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_158]
vpmovsxbw ymm4, xmmword ptr [rip + .LCPI1_159]
vpermt2w ymm5, ymm16, ymm13
vpmovsxbw ymm16, xmmword ptr [rip + .LCPI1_154]
vpermt2w ymm8, ymm11, ymm13
vmovdqa ymm11, ymm3
vpermt2w ymm11, ymm17, ymm13
vpmovsxbw ymm13, xmmword ptr [rip + .LCPI1_152]
vpmovsxbw ymm17, xmmword ptr [rip + .LCPI1_118]
vpermt2w ymm5, ymm16, ymm0
vpmovsxbw ymm16, xmmword ptr [rip + .LCPI1_155]
vpermt2w ymm8, ymm15, ymm0
vpmovsxbw ymm15, xmmword ptr [rip + .LCPI1_161]
vpermt2w ymm28, ymm17, ymm0
vpermt2w ymm12, ymm13, ymm0
vpmovsxbw ymm13, xmmword ptr [rip + .LCPI1_160]
vmovdqa64 ymm17, ymmword ptr [rsp + 832]
vpermt2w ymm7, ymm16, ymm0
vpmovsxbw ymm16, xmmword ptr [rip + .LCPI1_156]
vpermt2w ymm28, ymm1, ymm14
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_163]
vpermt2w ymm8, ymm15, ymm14
vpmovsxbw ymm15, xmmword ptr [rip + .LCPI1_164]
vpermt2w ymm12, ymm13, ymm14
vpmovsxbw ymm13, xmmword ptr [rip + .LCPI1_167]
vpermt2w ymm2, ymm16, ymm0
vpmovsxbw ymm16, xmmword ptr [rip + .LCPI1_157]
vpermt2w ymm7, ymm1, ymm14
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_169]
vpermt2w ymm2, ymm15, ymm14
vmovdqa ymm15, ymmword ptr [rsp + 896]
vpermt2w ymm11, ymm16, ymm0
vpmovsxbw ymm0, xmmword ptr [rip + .LCPI1_162]
vmovdqa64 ymm16, ymm28
vpmovsxbw ymm28, xmmword ptr [rip + .LCPI1_95]
vpermt2w ymm5, ymm0, ymm14
vpmovsxbw ymm0, xmmword ptr [rip + .LCPI1_165]
vpermt2w ymm9, ymm0, ymm14
vpmovsxbw ymm0, xmmword ptr [rip + .LCPI1_166]
vpermt2w ymm11, ymm0, ymm14
vmovdqa ymm0, ymmword ptr [r9 + 2*r13 - 6368]
vmovdqa ymm14, ymmword ptr [rsp + 960]
vpermt2w ymm8, ymm1, ymm0
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_171]
vpermt2w ymm12, ymm13, ymm0
vpmovsxbw ymm13, xmmword ptr [rip + .LCPI1_170]
vpermt2w ymm16, ymm4, ymm0
vpmovsxbw ymm4, xmmword ptr [rip + .LCPI1_174]
vpermt2w ymm7, ymm1, ymm0
vpmovsxbw ymm1, xmmword ptr [rip + .LCPI1_173]
vpermt2w ymm5, ymm13, ymm0
vpmovsxbw ymm13, xmmword ptr [rip + .LCPI1_172]
vpermt2w ymm6, ymm1, ymm0
vpblendd ymm1, ymm9, ymm0, 128
vpmovsxbw ymm9, xmmword ptr [rip + .LCPI1_168]
vpermt2w ymm2, ymm13, ymm0
vmovdqa ymm13, ymmword ptr [rsp + 928]
vpermt2w ymm11, ymm9, ymm0
vmovdqa ymm0, ymmword ptr [r9 + 2*r13 + 32]
vpmovsxbw ymm9, xmmword ptr [rip + .LCPI1_122]
add r13, 16
vpermt2w ymm16, ymm4, ymm0
vpmovsxbw ymm4, xmmword ptr [rip + .LCPI1_175]
vpermt2w ymm14, ymm9, ymm0
vmovaps ymm9, ymmword ptr [rsp + 704]
vpermt2w ymm13, ymm4, ymm0
vpmovsxbw ymm4, xmmword ptr [rip + .LCPI1_176]
vmovups ymmword ptr [r15 - 307680], ymm9
vmovaps ymm9, ymmword ptr [rsp + 672]
vpermt2w ymm12, ymm4, ymm0
vpmovsxbw ymm4, xmmword ptr [rip + .LCPI1_177]
vpermt2w ymm15, ymm4, ymm0
vpmovsxbw ymm4, xmmword ptr [rip + .LCPI1_178]
vpermt2w ymm8, ymm4, ymm0
vpmovsxbw ymm4, xmmword ptr [rip + .LCPI1_179]
vpermt2w ymm17, ymm4, ymm0
vpmovsxbw ymm4, xmmword ptr [rip + .LCPI1_180]
vpermt2w ymm5, ymm4, ymm0
vpmovsxbw ymm4, xmmword ptr [rip + .LCPI1_181]
vpermt2w ymm18, ymm4, ymm0
vpmovsxbw ymm4, xmmword ptr [rip + .LCPI1_182]
vpermt2w ymm7, ymm4, ymm0
vpmovsxbw ymm4, xmmword ptr [rip + .LCPI1_183]
vpermt2w ymm25, ymm4, ymm0
vpmovsxbw ymm4, xmmword ptr [rip + .LCPI1_184]
vpermt2w ymm2, ymm4, ymm0
vpmovsxbw ymm4, xmmword ptr [rip + .LCPI1_185]
vpermt2w ymm10, ymm4, ymm0
vpmovsxbw ymm4, xmmword ptr [rip + .LCPI1_186]
vpermt2w ymm6, ymm4, ymm0
vpmovsxbw ymm4, xmmword ptr [rip + .LCPI1_187]
vpermt2w ymm1, ymm4, ymm0
vmovaps ymm4, ymmword ptr [rsp + 1344]
vpblendw ymm0, ymm11, ymm0, 128
vpmovsxbw ymm11, xmmword ptr [rip + .LCPI1_96]
vpblendd ymm0, ymm3, ymm0, 240
vmovups ymmword ptr [r15 - 307648], ymm4
vmovaps ymm4, ymmword ptr [rsp + 1664]
vmovups ymmword ptr [r15 - 307616], ymm9
vmovaps ymm9, ymmword ptr [rsp + 1632]
vmovups ymmword ptr [r15 - 307584], ymm4
vmovaps ymm4, ymmword ptr [rsp + 1600]
vmovups ymmword ptr [r15 - 307552], ymm9
vmovaps ymm9, ymmword ptr [rsp + 640]
vmovups ymmword ptr [r15 - 307520], ymm4
vmovaps ymm4, ymmword ptr [rsp + 1568]
vmovups ymmword ptr [r15 - 307488], ymm9
vmovaps ymm9, ymmword ptr [rsp + 608]
vmovups ymmword ptr [r15 - 307456], ymm4
vmovaps ymm4, ymmword ptr [rsp + 1536]
vmovups ymmword ptr [r15 - 307424], ymm9
vmovaps ymm9, ymmword ptr [rsp + 320]
vmovups ymmword ptr [r15 - 307392], ymm4
vmovaps ymm4, ymmword ptr [rsp + 1504]
vmovups ymmword ptr [r15 - 307360], ymm9
vmovaps ymm9, ymmword ptr [rsp + 1472]
vmovups ymmword ptr [r15 - 307328], ymm4
vmovaps ymm4, ymmword ptr [rsp + 1440]
vmovups ymmword ptr [r15 - 307296], ymm9
vmovaps ymm9, ymmword ptr [rsp + 1408]
vmovups ymmword ptr [r15 - 307264], ymm4
vmovaps ymm4, ymmword ptr [rsp + 1376]
vmovups ymmword ptr [r15 - 307232], ymm9
vmovaps ymm9, ymmword ptr [rsp + 1088]
vmovups ymmword ptr [r15 - 307200], ymm4
vmovaps ymm4, ymmword ptr [rsp + 992]
vmovups ymmword ptr [r15 - 205280], ymm9
vmovaps ymm9, ymmword ptr [rsp + 576]
vmovups ymmword ptr [r15 - 205248], ymm4
vmovaps ymm4, ymmword ptr [rsp + 864]
vmovups ymmword ptr [r15 - 205216], ymm9
vmovaps ymm9, ymmword ptr [rsp + 1312]
vmovups ymmword ptr [r15 - 205184], ymm4
vmovaps ymm4, ymmword ptr [rsp + 1024]
vmovups ymmword ptr [r15 - 205152], ymm9
vmovaps ymm9, ymmword ptr [rsp + 544]
vmovups ymmword ptr [r15 - 205120], ymm4
vmovaps ymm4, ymmword ptr [rsp + 1280]
vmovups ymmword ptr [r15 - 205088], ymm9
vmovaps ymm9, ymmword ptr [rsp + 512]
vmovups ymmword ptr [r15 - 205056], ymm4
vmovaps ymm4, ymmword ptr [rsp + 1248]
vmovups ymmword ptr [r15 - 205024], ymm9
vmovaps ymm9, ymmword ptr [rsp + 288]
vmovups ymmword ptr [r15 - 204992], ymm4
vmovaps ymm4, ymmword ptr [rsp + 1216]
vmovups ymmword ptr [r15 - 204960], ymm9
vmovaps ymm9, ymmword ptr [rsp + 1184]
vmovups ymmword ptr [r15 - 204928], ymm4
vmovaps ymm4, ymmword ptr [rsp + 1152]
vmovups ymmword ptr [r15 - 204896], ymm9
vmovaps ymm9, ymmword ptr [rsp + 1120]
vmovups ymmword ptr [r15 - 204864], ymm4
vmovaps ymm4, ymmword ptr [rsp + 1056]
vmovups ymmword ptr [r15 - 204832], ymm9
vmovaps ymm9, ymmword ptr [rsp + 256]
vmovups ymmword ptr [r15 - 204800], ymm4
vmovaps ymm4, ymmword ptr [rsp + 224]
vmovups ymmword ptr [r15 - 102880], ymm9
vmovdqu64 ymmword ptr [r15 - 102848], ymm27
vmovaps ymm9, ymmword ptr [rsp + 192]
vmovups ymmword ptr [r15 - 102816], ymm4
vmovaps ymm4, ymmword ptr [rsp + 160]
vmovdqu64 ymmword ptr [r15 - 102784], ymm31
vmovups ymmword ptr [r15 - 102752], ymm9
vmovdqu64 ymmword ptr [r15 - 102720], ymm21
vmovaps ymm9, ymmword ptr [rsp + 480]
vmovups ymmword ptr [r15 - 102688], ymm4
vmovaps ymm4, ymmword ptr [rsp + 128]
vmovdqu64 ymmword ptr [r15 - 102656], ymm19
vmovups ymmword ptr [r15 - 102624], ymm9
vmovdqu64 ymmword ptr [r15 - 102592], ymm20
vmovaps ymm9, ymmword ptr [rsp + 448]
vmovups ymmword ptr [r15 - 102560], ymm4
vmovaps ymm4, ymmword ptr [rsp + 416]
vmovdqu64 ymmword ptr [r15 - 102528], ymm23
vmovups ymmword ptr [r15 - 102496], ymm9
vmovaps ymm9, ymmword ptr [rsp + 384]
vmovups ymmword ptr [r15 - 102464], ymm4
vmovaps ymm4, ymmword ptr [rsp + 352]
vmovups ymmword ptr [r15 - 102432], ymm9
vmovups ymmword ptr [r15 - 102400], ymm4
vmovdqu ymmword ptr [r15 - 480], ymm14
vmovdqu64 ymmword ptr [r15 - 448], ymm16
vmovdqu ymmword ptr [r15 - 416], ymm13
vmovdqu ymmword ptr [r15 - 384], ymm12
vmovdqu ymmword ptr [r15 - 352], ymm15
vmovdqu ymmword ptr [r15 - 320], ymm8
vmovdqu64 ymmword ptr [r15 - 288], ymm17
vmovdqu ymmword ptr [r15 - 256], ymm5
vmovdqu64 ymmword ptr [r15 - 224], ymm18
vmovdqu ymmword ptr [r15 - 192], ymm7
vmovdqu64 ymmword ptr [r15 - 160], ymm25
vmovdqu ymmword ptr [r15 - 128], ymm2
vmovdqu ymmword ptr [r15 - 96], ymm10
vmovdqu ymmword ptr [r15 - 64], ymm6
vmovdqu ymmword ptr [r15 - 32], ymm1
vmovdqu ymmword ptr [r15], ymm0
add r15, 512
cmp r13, 48
jb .LBB1_9
inc rsi
add r14, 55296000
cmp rsi, rdx
jne .LBB1_8
jmp .LBB1_11
.LBB1_14:
xor eax, eax
lea rsp, [rbp - 40]
.loc 1 4 3 epilogue_begin
pop rbx
pop r12
pop r13
pop r14
pop r15
pop rbp
.cfi_def_cfa rsp, 8
vzeroupper
ret
.Ltmp3:
.Lfunc_end1:
.size turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack, .Lfunc_end1-turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack
.cfi_endproc
# ---------------------------------------------------------------------------
# turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_2_batch_mmt4d_DxDx540x3200x16x16x1_f32xf16xf32
#
# Syntax: GAS, .intel_syntax noprefix. Requires AVX-512 (zmm16, {1to16}) and
# BMI1 (bextr).
#
# Tiled multiply-accumulate kernel. For every 16x16 f32 tile it visits:
#   1. the tile is zero-filled in memory with scalar stores,
#   2. its 16 rows are loaded into zmm0..zmm15 (one 16-lane row per register),
#   3. 3200 steps are run; each step widens 16 f16 values to f32 (zmm16) and
#      FMAs them with 16 broadcast f32 scalars, one scalar per row register,
#   4. the 16 rows are stored back.
#
# Arguments as they are used below. The field layout presumably follows the
# IREE HAL executable dispatch ABI (dispatch state / workgroup state) --
# TODO confirm against the runtime headers:
#   rdi  incoming value is never read
#   rsi  -> record: +12 u32 x step, +16 u32 y step, +20 u16 z step,
#                   +24 pointer to parameter words ("params"),
#                   +32 pointer to an array of buffer base pointers
#   rdx  -> record: +0 u32 x start, +4 u32 y start, +8 u16 z start
#   params: +0  qword, bits 1..60 used, scaled by 2 into the f16 operand
#           +8  qword, bits 2..60 used, scaled by 4 into the f32 tile buffer
#           +24 qword, bound of the z loop
#           +32/+36 two dwords combined as (hi << 32) + lo = D, bound of y loop
#   buffers[0] is the base of both multiplicand operands,
#   buffers[1] is the base of the 16x16 f32 tiles that are accumulated.
# Returns eax = 0.
#
# Loop nest (outer to inner):
#   z = z start; z < params[+24]; z += z step        (.LBB2_2  / .LBB2_14)
#   y = y start; y < D;           y += y step        (.LBB2_4  / .LBB2_13)
#   x = x start; x < 540;         x += x step        (.LBB2_6)
#   k byte offset 0..102400 step 32 (3200 steps)      (.LBB2_11)
#
# Byte strides used by the address arithmetic:
#   1024     = 16*16*4        one f32 tile
#   552960   = 540*1024       one y row of tiles   (138240 = 540*256 elements)
#   102400   = 3200*16*2      f16 operand, per x
#   55296000 = 540*102400     f16 operand, per z
#   204800   = 3200*16*4      f32 operand, per y   (per z: D*204800)
#
# Stack: after five pushes and "sub rsp, 120", rsp = rbp - 160. The slots
# [rbp-168] .. [rbp-288] therefore lie below rsp; the function makes no calls
# and presumably relies on the 128-byte red zone of the System V AMD64 ABI
# (assumed target -- confirm before reusing in kernel/signal contexts).
#   [rbp- 48] current z               [rbp-136] D * z step * 204800
#   [rbp- 56] tile ptr (x0,y0,z)      [rbp-144] z step
#   [rbp- 64] current y               [rbp-152] 4 * D * 138240 * z step
#   [rbp- 72] tile ptr (x0,y,z)       [rbp-160] z bound
#   [rbp- 80] f32 operand ptr + 60    [rbp-168] z * D * 138240
#   [rbp- 88] y start                 [rbp-176] y step * 204800
#   [rbp- 96] f16 operand ptr         [rbp-184] y step * 552960
#   [rbp-104] D                       [rbp-192] y step
#   [rbp-112] x start                 [rbp-200] x step * 102400
#   [rbp-120] D * 138240              [rbp-208] x step * 1024
#   [rbp-128] z step * 55296000       [rbp-216] x step
#   [rbp-288] .. [rbp-224]  element indices of tile rows 0 .. 8
# ---------------------------------------------------------------------------
.section .text.turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_2_batch_mmt4d_DxDx540x3200x16x16x1_f32xf16xf32,"ax",@progbits
.p2align 4, 0x90
.type turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_2_batch_mmt4d_DxDx540x3200x16x16x1_f32xf16xf32,@function
turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_2_batch_mmt4d_DxDx540x3200x16x16x1_f32xf16xf32:
.Lfunc_begin2:
.loc 1 1 0 is_stmt 1
.cfi_startproc
# Prologue: frame pointer plus all five remaining callee-saved GPRs.
push rbp
.cfi_def_cfa_offset 16
.cfi_offset rbp, -16
mov rbp, rsp
.cfi_def_cfa_register rbp
push r15
push r14
push r13
push r12
push rbx
sub rsp, 120
.cfi_offset rbx, -56
.cfi_offset r12, -48
.cfi_offset r13, -40
.cfi_offset r14, -32
.cfi_offset r15, -24
.Ltmp4:
.loc 1 4 3 prologue_end
# r8 = params, rax = third-argument record, rdx = z start (zero-extended u16).
mov r8, qword ptr [rsi + 24]
mov rax, rdx
movzx edx, word ptr [rdx + 8]
mov rcx, qword ptr [r8 + 24]
mov qword ptr [rbp - 48], rdx
mov qword ptr [rbp - 160], rcx
# Signed check: nothing to do when z bound <= z start.
cmp rcx, rdx
jle .LBB2_15
.loc 1 0 3 is_stmt 0
# edx/edi = low/high dwords of D.
mov edx, dword ptr [r8 + 32]
mov edi, dword ptr [r8 + 36]
# r10 = 138240 << 32, so r10 * hi(D) is the high-half contribution to D*138240.
movabs r10, 593736278999040
mov r12d, dword ptr [rax + 4]
mov rbx, qword ptr [rbp - 48]
mov r14d, dword ptr [rax]
mov rcx, qword ptr [rsi + 32]
# bextr controls: 15361 = 0x3C01 (start bit 1, length 60),
#                 15106 = 0x3B02 (start bit 2, length 59).
mov r9d, 15361
mov r11d, 15106
mov r15d, dword ptr [rsi + 12]
# r9 = params[+0] bits 1..60, r8 = params[+8] bits 2..60; both are re-scaled
# (x2 and x4) when they are folded into addresses below.
bextr r9, qword ptr [r8], r9
bextr r8, qword ptr [r8 + 8], r11
.loc 1 4 3
imul r10, rdi
imul r13, rdx, 138240
imul r11, r14, 102400
mov qword ptr [rbp - 112], r14
# r14 already holds the zero-extended x start (32-bit load above), so it is
# scaled to a tile byte offset directly.
shl r14, 10
shl rdi, 32
mov qword ptr [rbp - 88], r12
mov qword ptr [rbp - 216], r15
# r13 = D * 138240.
add r13, r10
imul r10, r12, 552960
mov rax, r13
imul rax, rbx
lea rax, [r10 + 4*rax]
imul r10, rbx, 55296000
# r14 = x0*1024 + y0*552960 + 4*z*D*138240 (byte offset of the first tile).
add r14, rax
mov rax, qword ptr [rcx + 8]
# r11 = x0*102400 + z*55296000 (+ 2*r9 below): byte offset into f16 operand.
add r11, r10
# r10 = rdi = D.
lea r10, [rdi + rdx]
add rdi, rdx
lea r11, [r11 + 2*r9]
lea r9, [r14 + 4*r8]
mov rdx, rdi
imul rdx, rbx
mov qword ptr [rbp - 104], r10
# r14 = buffers[1] + 4*r8: base that the accumulator loads/stores index from.
lea r14, [rax + 4*r8]
mov r8d, dword ptr [rsi + 16]
movzx esi, word ptr [rsi + 20]
add r9, rax
imul rax, rdx, 204800
imul rdx, r12, 204800
mov qword ptr [rbp - 56], r9
mov r9, qword ptr [rcx]
mov rcx, r15
shl rcx, 10
mov qword ptr [rbp - 208], rcx
imul rcx, r10, 138240
add rdx, rax
imul rdi, rsi
mov qword ptr [rbp - 120], rcx
imul rcx, rsi, 55296000
imul r13, rsi
mov qword ptr [rbp - 144], rsi
mov qword ptr [rbp - 192], r8
# The f32 operand pointer is biased by +60 so that the sixteen broadcast
# operands of the inner loop can use displacements -60 .. 0.
lea rax, [r9 + rdx + 60]
add r11, r9
mov qword ptr [rbp - 96], r11
imul rdx, rdi, 204800
mov qword ptr [rbp - 128], rcx
imul rcx, r8, 204800
shl r13, 2
mov qword ptr [rbp - 80], rax
mov qword ptr [rbp - 152], r13
mov qword ptr [rbp - 136], rdx
imul rdx, r8, 552960
mov qword ptr [rbp - 176], rcx
mov qword ptr [rbp - 184], rdx
imul rdx, r15, 102400
mov qword ptr [rbp - 200], rdx
jmp .LBB2_2
.p2align 4, 0x90
.LBB2_14:
# z-loop latch: advance z and the three z-dependent base pointers by their
# precomputed strides, then leave once z >= z bound.
.loc 1 0 3
mov rsi, qword ptr [rbp - 128]
mov rdx, qword ptr [rbp - 136]
mov rax, qword ptr [rbp - 48]
mov rcx, qword ptr [rbp - 56]
.loc 1 4 3
add rax, qword ptr [rbp - 144]
add rcx, qword ptr [rbp - 152]
add qword ptr [rbp - 96], rsi
add qword ptr [rbp - 80], rdx
mov qword ptr [rbp - 56], rcx
mov qword ptr [rbp - 48], rax
cmp rax, qword ptr [rbp - 160]
jge .LBB2_15
.LBB2_2:
# z-loop body: skip the y loop entirely when D <= y start.
.loc 1 0 3
mov rax, qword ptr [rbp - 104]
cmp rax, qword ptr [rbp - 88]
.loc 1 4 3
jle .LBB2_14
.loc 1 0 3
# Seed the y loop: r12 = f32 operand ptr, [rbp-72] = tile ptr, [rbp-64] = y,
# [rbp-168] = z * D * 138240 (element index contribution of z).
mov rax, qword ptr [rbp - 48]
mov r12, qword ptr [rbp - 80]
mov rdx, qword ptr [rbp - 56]
mov rcx, qword ptr [rbp - 88]
imul rax, qword ptr [rbp - 120]
mov qword ptr [rbp - 72], rdx
mov qword ptr [rbp - 64], rcx
mov qword ptr [rbp - 168], rax
jmp .LBB2_4
.p2align 4, 0x90
.LBB2_13:
# y-loop latch: advance y, the f32 operand pointer and the tile pointer.
mov rcx, qword ptr [rbp - 64]
mov rax, qword ptr [rbp - 72]
.loc 1 4 3
add r12, qword ptr [rbp - 176]
add rcx, qword ptr [rbp - 192]
add rax, qword ptr [rbp - 184]
mov qword ptr [rbp - 72], rax
mov qword ptr [rbp - 64], rcx
cmp rcx, qword ptr [rbp - 104]
jge .LBB2_14
.LBB2_4:
# y-loop body: unsigned 32-bit check, skip the x loop when x start > 539.
.loc 1 0 3
cmp dword ptr [rbp - 112], 539
.loc 1 4 3
ja .LBB2_13
.loc 1 0 3
# rcx = y*138240 + z*D*138240 (element index of tile row (y,z));
# r11 = f16 operand ptr, rbx = tile ptr, r8 = x.
imul rcx, qword ptr [rbp - 64], 138240
mov r11, qword ptr [rbp - 96]
mov rbx, qword ptr [rbp - 72]
mov r8, qword ptr [rbp - 112]
add rcx, qword ptr [rbp - 168]
.p2align 4, 0x90
.LBB2_6:
# x-loop body. rax = x*256 = element index of tile x within the row.
mov rax, r8
shl rax, 8
mov rdx, rbx
xor esi, esi
.p2align 4, 0x90
.LBB2_7:
# Zero-fill the 16x16 f32 tile at rbx: 16 rows (rsi) of 16 dwords (rdi).
# NOTE(review): tracing the prologue arithmetic, rbx and r14 + 4*(rcx+rax)
# appear to be the same address, so the loads below read back these zeros;
# confirm before replacing the fill + reload with register zeroing.
xor edi, edi
.p2align 4, 0x90
.LBB2_8:
.loc 1 4 3
mov dword ptr [rdx + 4*rdi], 0
inc rdi
cmp rdi, 16
jne .LBB2_8
inc rsi
add rdx, 64
cmp rsi, 16
jne .LBB2_7
# Compute the element index of every tile row (row m is at +16*m) and load the
# rows into the accumulators. Rows 0..8 spill their index to [rbp-288..-224];
# rows 9..15 keep it in rsi, r15, r10, rdx, rdi, r13, r9 for the stores.
# Row -> register map: 0:zmm0 1:zmm2 2:zmm1 3:zmm4 4:zmm3 5:zmm6 6:zmm5 7:zmm8
#                      8:zmm7 9:zmm10 10:zmm9 11:zmm12 12:zmm11 13:zmm14
#                      14:zmm13 15:zmm15
lea rdx, [rcx + rax]
lea rsi, [rcx + rax + 32]
lea r15, [rcx + rax + 160]
lea r10, [rcx + rax + 176]
lea rdi, [rcx + rax + 208]
lea r13, [rcx + rax + 224]
lea r9, [rcx + rax + 240]
mov qword ptr [rbp - 288], rdx
vmovups zmm0, zmmword ptr [r14 + 4*rdx]
lea rdx, [rcx + rax + 16]
mov qword ptr [rbp - 272], rsi
vmovups zmm1, zmmword ptr [r14 + 4*rsi]
lea rsi, [rcx + rax + 64]
vmovups zmm9, zmmword ptr [r14 + 4*r15]
vmovups zmm12, zmmword ptr [r14 + 4*r10]
vmovups zmm14, zmmword ptr [r14 + 4*rdi]
vmovups zmm13, zmmword ptr [r14 + 4*r13]
vmovups zmm15, zmmword ptr [r14 + 4*r9]
mov qword ptr [rbp - 280], rdx
vmovups zmm2, zmmword ptr [r14 + 4*rdx]
lea rdx, [rcx + rax + 48]
mov qword ptr [rbp - 256], rsi
vmovups zmm3, zmmword ptr [r14 + 4*rsi]
lea rsi, [rcx + rax + 96]
mov qword ptr [rbp - 264], rdx
vmovups zmm4, zmmword ptr [r14 + 4*rdx]
lea rdx, [rcx + rax + 80]
mov qword ptr [rbp - 240], rsi
vmovups zmm5, zmmword ptr [r14 + 4*rsi]
lea rsi, [rcx + rax + 128]
mov qword ptr [rbp - 248], rdx
vmovups zmm6, zmmword ptr [r14 + 4*rdx]
lea rdx, [rcx + rax + 112]
mov qword ptr [rbp - 224], rsi
vmovups zmm7, zmmword ptr [r14 + 4*rsi]
lea rsi, [rcx + rax + 144]
mov qword ptr [rbp - 232], rdx
vmovups zmm8, zmmword ptr [r14 + 4*rdx]
lea rdx, [rcx + rax + 192]
# rax becomes the inner-loop byte offset. No flags are live here, so the
# zeroing idiom is used, matching the other zeroing sites in this function.
xor eax, eax
vmovups zmm10, zmmword ptr [r14 + 4*rsi]
vmovups zmm11, zmmword ptr [r14 + 4*rdx]
.p2align 4, 0x90
.LBB2_11:
# Inner reduction step. rax advances 32 bytes per step over the f16 operand
# (16 halves) and, through the 2*rax scale, 64 bytes over the f32 operand
# (16 floats). Row m accumulates zmm16 * broadcast(f32[m]).
.loc 1 0 3
vcvtph2ps zmm16, ymmword ptr [r11 + rax]
vfmadd231ps zmm0, zmm16, dword ptr [r12 + 2*rax - 60]{1to16}
vfmadd231ps zmm2, zmm16, dword ptr [r12 + 2*rax - 56]{1to16}
vfmadd231ps zmm1, zmm16, dword ptr [r12 + 2*rax - 52]{1to16}
vfmadd231ps zmm4, zmm16, dword ptr [r12 + 2*rax - 48]{1to16}
vfmadd231ps zmm3, zmm16, dword ptr [r12 + 2*rax - 44]{1to16}
vfmadd231ps zmm6, zmm16, dword ptr [r12 + 2*rax - 40]{1to16}
vfmadd231ps zmm5, zmm16, dword ptr [r12 + 2*rax - 36]{1to16}
vfmadd231ps zmm8, zmm16, dword ptr [r12 + 2*rax - 32]{1to16}
vfmadd231ps zmm7, zmm16, dword ptr [r12 + 2*rax - 28]{1to16}
vfmadd231ps zmm10, zmm16, dword ptr [r12 + 2*rax - 24]{1to16}
vfmadd231ps zmm9, zmm16, dword ptr [r12 + 2*rax - 20]{1to16}
vfmadd231ps zmm12, zmm16, dword ptr [r12 + 2*rax - 16]{1to16}
vfmadd231ps zmm11, zmm16, dword ptr [r12 + 2*rax - 12]{1to16}
vfmadd231ps zmm14, zmm16, dword ptr [r12 + 2*rax - 8]{1to16}
vfmadd231ps zmm13, zmm16, dword ptr [r12 + 2*rax - 4]{1to16}
vfmadd231ps zmm15, zmm16, dword ptr [r12 + 2*rax]{1to16}
.loc 1 4 3
add rax, 32
cmp rax, 102400
jne .LBB2_11
.loc 1 0 3
# x-loop latch: advance x, the tile pointer and the f16 operand pointer, then
# write the sixteen accumulator rows back using the saved element indices.
mov rax, qword ptr [rbp - 288]
.loc 1 4 3
add r8, qword ptr [rbp - 216]
add rbx, qword ptr [rbp - 208]
add r11, qword ptr [rbp - 200]
vmovups zmmword ptr [r14 + 4*rax], zmm0
mov rax, qword ptr [rbp - 280]
vmovups zmmword ptr [r14 + 4*rax], zmm2
mov rax, qword ptr [rbp - 272]
vmovups zmmword ptr [r14 + 4*rax], zmm1
mov rax, qword ptr [rbp - 264]
vmovups zmmword ptr [r14 + 4*rax], zmm4
mov rax, qword ptr [rbp - 256]
vmovups zmmword ptr [r14 + 4*rax], zmm3
mov rax, qword ptr [rbp - 248]
vmovups zmmword ptr [r14 + 4*rax], zmm6
mov rax, qword ptr [rbp - 240]
vmovups zmmword ptr [r14 + 4*rax], zmm5
mov rax, qword ptr [rbp - 232]
vmovups zmmword ptr [r14 + 4*rax], zmm8
mov rax, qword ptr [rbp - 224]
vmovups zmmword ptr [r14 + 4*rax], zmm7
vmovups zmmword ptr [r14 + 4*rsi], zmm10
vmovups zmmword ptr [r14 + 4*r15], zmm9
vmovups zmmword ptr [r14 + 4*r10], zmm12
vmovups zmmword ptr [r14 + 4*rdx], zmm11
vmovups zmmword ptr [r14 + 4*rdi], zmm14
vmovups zmmword ptr [r14 + 4*r13], zmm13
vmovups zmmword ptr [r14 + 4*r9], zmm15
# Signed compare against the fixed x bound of 540.
cmp r8, 540
jl .LBB2_6
jmp .LBB2_13
.LBB2_15:
# Common exit: return 0, restore callee-saved registers, and clear the upper
# vector state before returning.
xor eax, eax
.loc 1 4 3 epilogue_begin
add rsp, 120
pop rbx
pop r12
pop r13
pop r14
pop r15
pop rbp
.cfi_def_cfa rsp, 8
vzeroupper
ret
.Ltmp5:
.Lfunc_end2:
.size turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_2_batch_mmt4d_DxDx540x3200x16x16x1_f32xf16xf32, .Lfunc_end2-turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_2_batch_mmt4d_DxDx540x3200x16x16x1_f32xf16xf32
.cfi_endproc
# ---------------------------------------------------------------------------
# turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_3_unpack_f32
#
# Copies 4-byte elements out of a tiled source buffer (16x16-element tiles,
# 1024 bytes per tile, 552960 bytes per row of tiles) into a row-major
# destination whose row pitch is 34560 bytes (8640 elements).
#
# Inputs as read by the code (System V registers):
#   rsi -> struct: dword +12, dword +16, word +20, pointer +24 to an array of
#          32-bit values, pointer +32 to an array of buffer pointers.
#   rdx -> struct: dword +0, dword +4, word +8.
#   NOTE(review): these look like IREE dispatch state (workgroup counts,
#   constants, bindings) and workgroup state (workgroup ids x/y/z) -- confirm
#   against the IREE HAL executable library header.
# Output: eax = 0.
#
# Main spill slots (rbp-relative):
#   -56   outermost index (starts at 64 * word[rdx+8])
#   -112  outermost bound (64-bit value built from constants [+16],[+20])
#   -128  middle index    (starts at 64 * dword[rdx+4])
#   -208  middle bound    (64-bit value built from constants [+24],[+28])
#   -80   column index    (starts at 960 * dword[rdx]), bound 8640
#   -64/-72/-88/-96  destination pointers for successive loop levels
#   -232/-48         source pointers for successive loop levels
# ---------------------------------------------------------------------------
.section .text.turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_3_unpack_f32,"ax",@progbits
.p2align 4, 0x90
.type turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_3_unpack_f32,@function
turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_3_unpack_f32:
.Lfunc_begin3:
.loc 1 1 0 is_stmt 1
.cfi_startproc
# Frame-pointer prologue; all five callee-saved GPRs are used below.
push rbp
.cfi_def_cfa_offset 16
.cfi_offset rbp, -16
mov rbp, rsp
.cfi_def_cfa_register rbp
.Ltmp6:
push r15
push r14
push r13
push r12
push rbx
sub rsp, 128
.cfi_offset rbx, -56
.cfi_offset r12, -48
.cfi_offset r13, -40
.cfi_offset r14, -32
.cfi_offset r15, -24
.loc 1 4 3 prologue_end
# rcx = constants array; edi = word[rdx+8]; build the 64-bit outer bound
# from the two dwords at constants[+16] (low) and constants[+20] (high).
mov rcx, qword ptr [rsi + 24]
movzx edi, word ptr [rdx + 8]
mov r15d, dword ptr [rcx + 20]
mov r9d, dword ptr [rcx + 16]
mov r12, rdi
shl edi, 6
mov qword ptr [rbp - 56], rdi
shl r15, 32
lea rax, [r15 + r9]
mov qword ptr [rbp - 112], rax
# Nothing to do when the starting outer index (64 * word[rdx+8]) is already
# at or past the bound.
cmp rdi, rax
jge .LBB3_22
.loc 1 0 3 is_stmt 0
# Load the remaining fields of both structs and precompute strides.
mov eax, dword ptr [rsi + 12]
mov r10d, dword ptr [rcx + 24]
mov r11d, dword ptr [rcx + 28]
mov r13d, dword ptr [rdx]
mov edx, dword ptr [rdx + 4]
# rbx = 8640 * 2^32: multiplied by the high dword so that, together with
# 8640 * low dword, rax becomes 8640 * (64-bit middle bound).
movabs rbx, 37108517437440
mov edi, dword ptr [rsi + 16]
movzx r8d, word ptr [rsi + 20]
mov rsi, qword ptr [rsi + 32]
.loc 1 4 3
# r15 = outer bound - starting outer index (elements remaining).
or r15, r9
# r14 = 34560 * 2^32: same trick for the byte row pitch (34560 = 4 * 8640).
movabs r14, 148434069749760
sub r15, qword ptr [rbp - 56]
mov qword ptr [rbp - 48], rax
imul rbx, r11
imul rax, r10, 8640
imul r9, rdx, 2211840
imul r14, r11
shl r11, 32
shl rdx, 6
mov qword ptr [rbp - 216], r13
mov qword ptr [rbp - 104], rdx
add rax, rbx
# Destination start = buffer[1] + 3840*dword[rdx] + 2211840*dword[rdx+4]
#                     + (8640 * middle bound * word[rdx+8]) * 256.
imul rbx, r13, 3840
add rbx, r9
add rbx, qword ptr [rsi + 8]
mov r9, rax
imul r9, r12
mov rsi, qword ptr [rsi]
imul rax, r8
shl r9, 8
shl rax, 8
mov qword ptr [rbp - 160], rax
add rbx, r9
# BEXTR control 15106 = 0x3B02: extract 59 bits starting at bit 2 of the
# qword at constants[+0]; used below as a dword index into buffer[0].
mov r9d, 15106
mov qword ptr [rbp - 64], rbx
lea rbx, [r11 + r10]
or r11, r10
imul r10, r10, 34560
bextr r9, qword ptr [rcx], r9
sub r11, rdx
mov rdx, qword ptr [rbp - 48]
mov qword ptr [rbp - 208], rbx
mov qword ptr [rbp - 136], r11
add r10, r14
mov qword ptr [rbp - 280], r10
mov r10, qword ptr [rcx + 8]
# [rbp-184] = source base = buffer[0] + 4 * extracted offset.
lea rcx, [rsi + 4*r9]
imul rsi, rdi, 2211840
shl rdi, 6
imul rax, rdx, 960
imul rdx, rdx, 3840
mov qword ptr [rbp - 192], rdi
mov qword ptr [rbp - 184], rcx
imul rcx, r13, 960
mov qword ptr [rbp - 200], rsi
imul r12, r10
imul r9, r10, 552960
imul r10, r8
shl r8d, 6
mov qword ptr [rbp - 248], rax
mov qword ptr [rbp - 240], rdx
mov qword ptr [rbp - 144], r8
mov qword ptr [rbp - 176], rcx
shl r12, 6
shl r10, 6
mov qword ptr [rbp - 272], r9
mov qword ptr [rbp - 120], r12
mov qword ptr [rbp - 152], r10
jmp .LBB3_2
.p2align 4, 0x90
.LBB3_21:
.loc 1 0 3
# Latch of the outermost loop: advance the outer index by 64*word[rsi+20],
# bump the destination pointer and source tile index, shrink the remainder.
mov rdx, qword ptr [rbp - 152]
mov rsi, qword ptr [rbp - 64]
mov rax, qword ptr [rbp - 56]
mov rcx, qword ptr [rbp - 144]
mov r15, qword ptr [rbp - 168]
.loc 1 4 3
add rsi, qword ptr [rbp - 160]
add qword ptr [rbp - 120], rdx
add rax, rcx
sub r15, rcx
mov qword ptr [rbp - 64], rsi
mov qword ptr [rbp - 56], rax
cmp rax, qword ptr [rbp - 112]
jge .LBB3_22
.LBB3_2:
# Outermost loop header. [rbp-288] = min(64, remaining r15) clamped to at
# least 1; it is the trip count of the loop at .LBB3_8.
cmp r15, 64
mov ecx, 64
mov eax, 1
mov qword ptr [rbp - 168], r15
cmovl rcx, r15
cmp rcx, 2
cmovl rcx, rax
mov qword ptr [rbp - 288], rcx
# Skip the middle loop when its start index is already >= its bound (rbx).
cmp qword ptr [rbp - 104], rbx
jge .LBB3_21
.loc 1 0 3
mov rcx, qword ptr [rbp - 64]
mov rax, qword ptr [rbp - 112]
mov rdx, qword ptr [rbp - 104]
.loc 1 4 3
sub rax, qword ptr [rbp - 56]
mov qword ptr [rbp - 72], rcx
mov rcx, qword ptr [rbp - 136]
mov qword ptr [rbp - 256], rax
jmp .LBB3_4
.p2align 4, 0x90
.LBB3_20:
.loc 1 0 3
# Latch of the middle loop: step by 64*dword[rsi+16] rows
# (2211840*dword[rsi+16] destination bytes).
mov rsi, qword ptr [rbp - 72]
mov rax, qword ptr [rbp - 192]
mov rdx, qword ptr [rbp - 128]
mov rcx, qword ptr [rbp - 224]
mov rbx, qword ptr [rbp - 208]
.loc 1 4 3
add rsi, qword ptr [rbp - 200]
add rdx, rax
sub rcx, rax
mov qword ptr [rbp - 72], rsi
cmp rdx, rbx
jge .LBB3_21
.LBB3_4:
# Middle loop header. [rbp-264] = min(64, remaining rows rcx);
# rdx / [rbp-296] describe min(64, bound - index) for the row loops below.
cmp rcx, 64
mov eax, 64
mov qword ptr [rbp - 128], rdx
mov qword ptr [rbp - 224], rcx
cmovl rax, rcx
sub rbx, rdx
mov edx, 64
cmp rbx, 64
mov qword ptr [rbp - 264], rax
mov qword ptr [rbp - 296], rbx
cmovl rdx, rbx
# The column loop starts at 960*dword[rdx]; values above 8 would start at
# or beyond 8640, so there is no column work for them.
cmp dword ptr [rbp - 216], 8
ja .LBB3_20
.loc 1 0 3
# Signed divide of the middle index by 16 (sign-mask / shift sequence) to
# get the source tile-row, then scale by 552960 bytes per tile-row.
mov rsi, qword ptr [rbp - 128]
mov rax, rsi
sar rax, 63
mov rcx, rax
xor rcx, rsi
lea rsi, [rcx + 15]
test rcx, rcx
cmovns rsi, rcx
mov rcx, qword ptr [rbp - 176]
sar rsi, 4
xor rsi, rax
.loc 1 4 3
add rsi, qword ptr [rbp - 120]
mov qword ptr [rbp - 80], rcx
imul rax, rsi, 552960
add rax, qword ptr [rbp - 184]
mov rsi, qword ptr [rbp - 72]
mov qword ptr [rbp - 232], rax
mov qword ptr [rbp - 88], rsi
jmp .LBB3_6
.p2align 4, 0x90
.LBB3_19:
.loc 1 0 3
# Latch of the column loop: step by 960*dword[rsi+12] elements
# (3840*dword[rsi+12] bytes) until the column index reaches 8640.
mov rcx, qword ptr [rbp - 80]
mov rax, qword ptr [rbp - 88]
.loc 1 4 3
add rcx, qword ptr [rbp - 248]
add rax, qword ptr [rbp - 240]
mov qword ptr [rbp - 88], rax
mov qword ptr [rbp - 80], rcx
cmp rcx, 8640
jge .LBB3_20
.LBB3_6:
.loc 1 0 3
# Column loop header; skipped when no outer elements remain.
cmp qword ptr [rbp - 256], 0
.loc 1 4 3
jle .LBB3_19
.loc 1 0 3
# Column index / 16 (same signed-divide idiom) * 1024 bytes per tile gives
# the source offset of the first tile in this column group.
mov rsi, qword ptr [rbp - 80]
mov rax, rsi
sar rax, 63
mov rcx, rax
xor rcx, rsi
lea rsi, [rcx + 15]
test rcx, rcx
cmovns rsi, rcx
shr rsi, 4
xor rsi, rax
mov rax, qword ptr [rbp - 88]
.loc 1 4 3
shl rsi, 10
add rsi, qword ptr [rbp - 232]
mov qword ptr [rbp - 96], rax
mov qword ptr [rbp - 48], rsi
xor esi, esi
jmp .LBB3_8
.p2align 4, 0x90
.LBB3_18:
.loc 1 0 3
# Latch of the per-outer-element loop (rsi counts up to [rbp-288]).
mov rax, qword ptr [rbp - 96]
mov rcx, qword ptr [rbp - 48]
.loc 1 4 3
inc rsi
add rax, qword ptr [rbp - 280]
add rcx, qword ptr [rbp - 272]
mov qword ptr [rbp - 96], rax
mov qword ptr [rbp - 48], rcx
cmp rsi, qword ptr [rbp - 288]
je .LBB3_19
.LBB3_8:
.loc 1 0 3
cmp qword ptr [rbp - 296], 0
.loc 1 4 3
jle .LBB3_18
.loc 1 0 3
# rbx = source pointer, r12 = destination pointer, r9 = rows remaining,
# r13 = row offset within this 64-row block.
mov rbx, qword ptr [rbp - 48]
mov r12, qword ptr [rbp - 96]
mov r9, qword ptr [rbp - 264]
xor r13d, r13d
jmp .LBB3_10
.p2align 4, 0x90
.LBB3_17:
.loc 1 4 3
# Next group of 16 rows: 16 * 34560 = 552960 destination bytes, and one
# tile-row (552960 bytes) in the source.
add r13, 16
add r9, -16
add r12, 552960
add rbx, 552960
cmp r13, rdx
jge .LBB3_18
.LBB3_10:
# r10 = rows to copy from each tile = min(16, r9), clamped to at least 1.
cmp r9, 16
mov r10d, 16
mov eax, 1
mov r14, rbx
mov rcx, r12
cmovl r10, r9
cmp r10, 2
cmovl r10, rax
xor eax, eax
jmp .LBB3_11
.p2align 4, 0x90
.LBB3_16:
# Next tile along the row: 64 destination bytes (16 elements), 1024 source
# bytes (one tile). 60 tiles are visited (rax = 0, 16, ..., 944).
lea rdi, [rax + 16]
add rcx, 64
add r14, 1024
cmp rax, 944
mov rax, rdi
jae .LBB3_17
.LBB3_11:
.loc 1 0 3
cmp rdx, r13
.loc 1 4 3
jle .LBB3_16
.loc 1 0 3
mov rdi, r14
mov r15, rcx
xor r11d, r11d
.p2align 4, 0x90
.LBB3_13:
# One tile row: copy 16 dwords one at a time.
xor r8d, r8d
.p2align 4, 0x90
.LBB3_14:
.loc 1 4 3
vmovss xmm0, dword ptr [rdi + 4*r8]
vmovss dword ptr [r15 + 4*r8], xmm0
inc r8
cmp r8, 16
jne .LBB3_14
# Advance to the next destination row (34560 bytes) and the next 16-element
# row inside the source tile (64 bytes).
inc r11
add r15, 34560
add rdi, 64
cmp r11, r10
jne .LBB3_13
jmp .LBB3_16
.LBB3_22:
# Common exit: return 0 and restore callee-saved registers.
xor eax, eax
.loc 1 4 3 epilogue_begin
add rsp, 128
pop rbx
pop r12
pop r13
pop r14
pop r15
pop rbp
.cfi_def_cfa rsp, 8
ret
.Ltmp7:
.Lfunc_end3:
.size turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_3_unpack_f32, .Lfunc_end3-turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_3_unpack_f32
.cfi_endproc
.section .text.iree_hal_executable_library_query,"ax",@progbits
.globl iree_hal_executable_library_query
.p2align 4, 0x90
.type iree_hal_executable_library_query,@function
# iree_hal_executable_library_query
# In:    edi = requested value (compared against 4)
# Out:   rax = address of iree_hal_executable_library_query_v0 when edi == 4,
#              otherwise 0
# Clobb: rcx, flags
iree_hal_executable_library_query:
.Liree_hal_executable_library_query$local:
.type .Liree_hal_executable_library_query$local,@function
.Lfunc_begin4:
.cfi_startproc
# Assume a match, then replace the result with null on mismatch.
lea rax, [rip + iree_hal_executable_library_query_v0]
xor ecx, ecx
cmp edi, 4
cmovne rax, rcx
ret
.Lfunc_end4:
.size iree_hal_executable_library_query, .Lfunc_end4-iree_hal_executable_library_query
.size .Liree_hal_executable_library_query$local, .Lfunc_end4-iree_hal_executable_library_query
.cfi_endproc
# iree_h2f_ieee: convert IEEE binary16 bits to a binary32 value.
# In:    edi = half-precision bit pattern in the low 16 bits
# Out:   xmm0 = single-precision result
# Clobb: eax, ecx, edx, edi, xmm1, flags
.section .text.iree_h2f_ieee,"ax",@progbits
.p2align 4, 0x90
.type iree_h2f_ieee,@function
iree_h2f_ieee:
.Lfunc_begin5:
.cfi_startproc
# eax = sign bit moved to bit 31, ecx = 10-bit mantissa, dx = exponent field.
mov eax, edi
and eax, 32768
mov edx, edi
mov ecx, edi
and ecx, 1023
shl eax, 16
and dx, 31744
# Exponent field zero: zero or subnormal.
je .LBB5_6
and edi, 31744
cmp edi, 31744
# Exponent field not all ones: ordinary normal number.
jne .LBB5_5
test cx, cx
je .LBB5_4
# NaN: sign | 0x7FC00000 (the half payload is not carried over).
or eax, 2143289344
vmovd xmm0, eax
ret
.LBB5_6:
# Subnormal/zero: value = mantissa * 2^-24, with the sign folded into the
# scale factor (0x33800000 is 2^-24).
movzx ecx, cx
or eax, 864026624
vcvtsi2ss xmm0, xmm0, ecx
vmovd xmm1, eax
vmulss xmm0, xmm0, xmm1
ret
.LBB5_5:
# Normal: shift exponent+mantissa into place (<< 13) and rebias the exponent
# by adding 0x38000000 (112 << 23).
movzx ecx, cx
movzx edx, dx
add edx, ecx
shl edx, 13
lea eax, [rdx + rax + 939524096]
vmovd xmm0, eax
ret
.LBB5_4:
# Infinity: sign | 0x7F800000.
or eax, 2139095040
vmovd xmm0, eax
ret
.Lfunc_end5:
.size iree_h2f_ieee, .Lfunc_end5-iree_h2f_ieee
.cfi_endproc
# iree_f2h_ieee: convert a binary32 value to IEEE binary16 bits.
# In:    xmm0 = single-precision value
# Out:   eax  = half-precision bit pattern in the low 16 bits
# Clobb: ecx, edx, esi, edi, r8d, flags
#
# Rounding adds half an ULP (bit 12 of the float mantissa) before truncating.
#
# Fixes relative to the previous version of this routine:
#   * fast path: the rounded mantissa is now ADDED to the exponent field, so a
#     rounding carry bumps the exponent (e.g. 1.99999f -> 0x4000, not 0x3C00);
#   * underflow/zero exit keeps the sign bit (-0.0f -> 0x8000);
#   * NaN results always have the quiet bit set, so a NaN whose payload lives
#     only in the low 13 mantissa bits no longer turns into infinity.
# NOTE(review): this file looks compiler-generated (.LBB/.Lfunc labels); the
# same corrections should be mirrored in whatever source produced it.
.section .text.iree_f2h_ieee,"ax",@progbits
.p2align 4, 0x90
.type iree_f2h_ieee,@function
iree_f2h_ieee:
.Lfunc_begin6:
.cfi_startproc
vmovd edi, xmm0
# BEXTR control 2071 = 0x0817: 8 bits starting at bit 23 -> biased exponent.
mov edx, 2071
bextr esi, edi, edx
mov eax, edi
mov ecx, edi
# ecx = sign (0/1), eax = 23-bit mantissa, edx = exponent rebased to half.
shr ecx, 31
and eax, 8388607
lea edx, [rsi - 112]
lea r8d, [rsi - 113]
# Fast path only for rebased exponents 1..29.
cmp r8d, 28
ja .LBB6_2
add eax, 4096
shl ecx, 15
shl edx, 10
shr eax, 13
or eax, ecx
# add (not or): a mantissa carry out of bit 9 must increment the exponent.
# With exponent <= 29 the sum stays below bit 15, so the sign is untouched.
add eax, edx
ret
.LBB6_2:
test edi, edi
je .LBB6_6
cmp esi, 112
ja .LBB6_7
# Biased float exponents 102..112 produce half subnormals.
cmp esi, 102
jae .LBB6_10
.LBB6_6:
# Zero or underflow to zero: return a signed zero (ecx is 0 for +0.0).
shl ecx, 15
mov eax, ecx
ret
.LBB6_7:
# Rebased exponent 143 <=> float exponent field 255 (Inf/NaN).
cmp edx, 143
jne .LBB6_11
shl ecx, 15
test eax, eax
je .LBB6_14
# NaN: keep the top mantissa bits and force the quiet bit (0x7E00).
shr eax, 13
or ecx, eax
or ecx, 32256
mov eax, ecx
ret
.LBB6_10:
# Subnormal result: restore the implicit one, shift right by (113 - exp),
# round by adding 2 * bit12, then drop the low 13 bits.
mov dl, 113
or eax, 8388608
shl ecx, 15
sub dl, sil
shrx eax, eax, edx
mov edx, eax
and edx, 4096
lea eax, [rax + 2*rdx]
shr eax, 13
or eax, ecx
ret
.LBB6_11:
# Large finite values (rebased exponent >= 30): round with explicit carry.
test edi, 4096
je .LBB6_13
lea edi, [rax + 8192]
add esi, -111
xor r8d, r8d
# If the mantissa would overflow, it becomes 0 and the exponent is bumped.
cmp eax, 8380416
cmovb esi, edx
cmovb r8d, edi
mov eax, r8d
mov edx, esi
.LBB6_13:
shl ecx, 15
cmp edx, 31
jb .LBB6_15
.LBB6_14:
# Infinity (also used for overflow): sign | 0x7C00.
or ecx, 31744
mov eax, ecx
ret
.LBB6_15:
shr eax, 13
shl edx, 10
or eax, ecx
or eax, edx
ret
.Lfunc_end6:
.size iree_f2h_ieee, .Lfunc_end6-iree_f2h_ieee
.cfi_endproc
# __gnu_h2f_ieee: convert IEEE binary16 bits to a binary32 value.
# The instruction sequence is the same as iree_h2f_ieee.
# In:    edi = half-precision bit pattern in the low 16 bits
# Out:   xmm0 = single-precision result
# Clobb: eax, ecx, edx, edi, xmm1, flags
.section .text.__gnu_h2f_ieee,"ax",@progbits
.p2align 4, 0x90
.type __gnu_h2f_ieee,@function
__gnu_h2f_ieee:
.Lfunc_begin7:
.cfi_startproc
# eax = sign at bit 31, ecx = mantissa, dx = exponent field.
mov eax, edi
and eax, 32768
mov edx, edi
mov ecx, edi
and ecx, 1023
shl eax, 16
and dx, 31744
je .LBB7_6
and edi, 31744
cmp edi, 31744
jne .LBB7_5
test cx, cx
je .LBB7_4
# NaN: sign | 0x7FC00000.
or eax, 2143289344
vmovd xmm0, eax
ret
.LBB7_6:
# Zero/subnormal: mantissa * (+/- 2^-24).
movzx ecx, cx
or eax, 864026624
vcvtsi2ss xmm0, xmm0, ecx
vmovd xmm1, eax
vmulss xmm0, xmm0, xmm1
ret
.LBB7_5:
# Normal: (exponent|mantissa) << 13, plus sign, plus exponent rebias.
movzx ecx, cx
movzx edx, dx
add edx, ecx
shl edx, 13
lea eax, [rdx + rax + 939524096]
vmovd xmm0, eax
ret
.LBB7_4:
# Infinity: sign | 0x7F800000.
or eax, 2139095040
vmovd xmm0, eax
ret
.Lfunc_end7:
.size __gnu_h2f_ieee, .Lfunc_end7-__gnu_h2f_ieee
.cfi_endproc
# __extendhfsf2: widen a binary16 value to binary32.
# In:    xmm0 = half-precision bit pattern in the low 16 bits
# Out:   xmm0 = single-precision result
# Clobb: eax, ecx, edx, esi, xmm1, flags
.section .text.__extendhfsf2,"ax",@progbits
.p2align 4, 0x90
.type __extendhfsf2,@function
__extendhfsf2:
.Lfunc_begin8:
.cfi_startproc
vmovd ecx, xmm0
# eax = sign at bit 31, edx = mantissa, esi = exponent field.
mov eax, ecx
shl eax, 16
mov edx, ecx
and edx, 1023
mov esi, ecx
and eax, -2147483648
and esi, 31744
je .LBB8_6
cmp esi, 31744
jne .LBB8_5
test dx, dx
je .LBB8_4
# NaN: sign | 0x7FC00000.
or eax, 2143289344
vmovd xmm0, eax
ret
.LBB8_6:
# Zero/subnormal: mantissa * (+/- 2^-24).
movzx ecx, dx
or eax, 864026624
# NOTE(review): the merge operand here is xmm1, which this function has not
# written yet; the sibling h2f routines use xmm0. Only the upper lanes of the
# result depend on it, so the returned float is unaffected.
vcvtsi2ss xmm0, xmm1, ecx
vmovd xmm1, eax
vmulss xmm0, xmm0, xmm1
ret
.LBB8_5:
# Normal: drop the sign, shift into place and rebias the exponent.
and ecx, 32767
shl ecx, 13
lea eax, [rcx + rax + 939524096]
vmovd xmm0, eax
ret
.LBB8_4:
# Infinity: sign | 0x7F800000.
or eax, 2139095040
vmovd xmm0, eax
ret
.Lfunc_end8:
.size __extendhfsf2, .Lfunc_end8-__extendhfsf2
.cfi_endproc
# __gnu_f2h_ieee: alias-style wrapper; tail-calls iree_f2h_ieee with the
# argument (xmm0) untouched, so the result is whatever that routine returns.
.section .text.__gnu_f2h_ieee,"ax",@progbits
.p2align 4, 0x90
.type __gnu_f2h_ieee,@function
__gnu_f2h_ieee:
.Lfunc_begin9:
.cfi_startproc
jmp iree_f2h_ieee
.Lfunc_end9:
.size __gnu_f2h_ieee, .Lfunc_end9-__gnu_f2h_ieee
.cfi_endproc
# __truncsfhf2: narrow a binary32 value to binary16.
# In:    xmm0 = single-precision value
# Out:   xmm0 = half-precision bit pattern in the low 16 bits
# Calls iree_f2h_ieee, which returns the half bits in ax.
.section .text.__truncsfhf2,"ax",@progbits
.p2align 4, 0x90
.type __truncsfhf2,@function
__truncsfhf2:
.Lfunc_begin10:
.cfi_startproc
# Reserve one 8-byte slot: realigns rsp to 16 for the call and doubles as
# the bounce buffer that moves the result from ax into xmm0.
sub rsp, 8
.cfi_def_cfa_offset 16
call iree_f2h_ieee
mov word ptr [rsp + 4], ax
vmovss xmm0, dword ptr [rsp + 4]
add rsp, 8
.cfi_def_cfa_offset 8
ret
.Lfunc_end10:
.size __truncsfhf2, .Lfunc_end10-__truncsfhf2
.cfi_endproc
# Constants for ceilf: 2^120, -0.0f and 1.0f.
.section .rodata.cst4,"aM",@progbits,4
.p2align 2, 0x0
.LCPI11_0:
.long 0x7b800000
.LCPI11_1:
.long 0x80000000
.LCPI11_2:
.long 0x3f800000
# ceilf: round toward +infinity using integer bit manipulation.
# In:    xmm0 = x
# Out:   xmm0 = ceil(x)
# Clobb: eax, ecx, edx, esi, edi, xmm1, k1, flags; scratch below rsp
.section .text.ceilf,"ax",@progbits
.p2align 4, 0x90
.type ceilf,@function
ceilf:
.Lfunc_begin11:
.cfi_startproc
vmovd eax, xmm0
# ecx = biased exponent (BEXTR: 8 bits from bit 23).
mov ecx, 2071
bextr ecx, eax, ecx
# Exponent >= 150: no fractional bits (also Inf/NaN) -> return x unchanged.
cmp ecx, 149
ja .LBB11_7
# Exponent < 127: |x| < 1.
cmp ecx, 127
jb .LBB11_4
# edx = mask of the fractional mantissa bits for this exponent.
add ecx, -127
mov edx, 8388607
shrx edx, edx, ecx
test edx, eax
# Already an integer.
je .LBB11_7
# The sum x + 2^120 is only stored to scratch and never reloaded here;
# presumably kept to force the floating-point evaluation -- TODO confirm.
vaddss xmm0, xmm0, dword ptr [rip + .LCPI11_0]
xor esi, esi
test eax, eax
mov edi, -8388608
# Positive x: add the fraction mask so the truncation below rounds up;
# negative x: add nothing (truncation already rounds toward +inf).
cmovs edx, esi
sarx ecx, edi, ecx
add edx, eax
and edx, ecx
vmovss dword ptr [rsp - 8], xmm0
vmovd xmm0, edx
ret
.LBB11_4:
# Same scratch-only evaluation as above.
vaddss xmm1, xmm0, dword ptr [rip + .LCPI11_0]
vmovss dword ptr [rsp - 4], xmm1
test eax, eax
js .LBB11_5
# Non-negative and |x| < 1: +0.0 stays as is, anything else becomes 1.0.
vmovss xmm1, dword ptr [rip + .LCPI11_2]
sete al
kmovd k1, eax
vmovss xmm1 {k1}, xmm1, xmm0
vmovaps xmm0, xmm1
.LBB11_7:
ret
.LBB11_5:
# Negative and |x| < 1: result is -0.0.
vmovss xmm0, dword ptr [rip + .LCPI11_1]
ret
.Lfunc_end11:
.size ceilf, .Lfunc_end11-ceilf
.cfi_endproc
# Constants for expf.
# Single precision: overflow threshold, underflow threshold, and the two
# factors squared on the underflow (2^-95) and overflow (2^97) exits.
.section .rodata.cst4,"aM",@progbits,4
.p2align 2, 0x0
.LCPI12_0:
.long 0x42b17217
.LCPI12_1:
.long 0xc2cff1b4
.LCPI12_2:
.long 0x10000000
.LCPI12_3:
.long 0x70000000
# Double precision: input scale, +/- rounding shift (1.5 * 2^52), three
# polynomial coefficients and 1.0.
.section .rodata.cst8,"aM",@progbits,8
.p2align 3, 0x0
.LCPI12_4:
.quad 0x40471547652b82fe
.LCPI12_5:
.quad 0x4338000000000000
.LCPI12_6:
.quad 0xc338000000000000
.LCPI12_7:
.quad 0x3ebc6af84b912394
.LCPI12_8:
.quad 0x3f2ebfce50fac4f3
.LCPI12_9:
.quad 0x3f962e42ff0c52d6
.LCPI12_10:
.quad 0x3ff0000000000000
# expf: e^x evaluated in double precision with a 32-entry table
# (__exp2f_data, defined outside this view) and a short polynomial.
# In:    xmm0 = x
# Out:   xmm0 = expf(x)
# Clobb: rax, rcx, rdx, xmm1-xmm3, flags; scratch below rsp
.section .text.expf,"ax",@progbits
.p2align 4, 0x90
.type expf,@function
expf:
.Lfunc_begin12:
.cfi_startproc
vmovd ecx, xmm0
# eax = bits 20..30 of x (exponent plus top mantissa bits, sign dropped).
mov eax, 2836
bextr eax, ecx, eax
# Large |x| (or Inf/NaN) takes the special-case path.
cmp eax, 1067
jae .LBB12_1
.LBB12_8:
# z = x * scale; kd = round(z) via the add/subtract-shift trick; r = z - kd.
vcvtss2sd xmm0, xmm0, xmm0
vmulsd xmm0, xmm0, qword ptr [rip + .LCPI12_4]
lea rdx, [rip + __exp2f_data]
vaddsd xmm1, xmm0, qword ptr [rip + .LCPI12_5]
vmovq rax, xmm1
vaddsd xmm1, xmm1, qword ptr [rip + .LCPI12_6]
# Table index = low 5 bits of the rounded integer; the integer shifted left
# by 47 is added to the table entry to adjust the double's exponent.
mov ecx, eax
and ecx, 31
shl rax, 47
add rax, qword ptr [rdx + 8*rcx]
vsubsd xmm0, xmm0, xmm1
# Polynomial in r, then scale by the table-derived factor.
vmulsd xmm2, xmm0, qword ptr [rip + .LCPI12_7]
vmovq xmm1, rax
vmulsd xmm3, xmm0, xmm0
vaddsd xmm2, xmm2, qword ptr [rip + .LCPI12_8]
vmulsd xmm0, xmm0, qword ptr [rip + .LCPI12_9]
vaddsd xmm0, xmm0, qword ptr [rip + .LCPI12_10]
vmulsd xmm2, xmm3, xmm2
vaddsd xmm0, xmm0, xmm2
vmulsd xmm0, xmm0, xmm1
vcvtsd2ss xmm1, xmm0, xmm0
.LBB12_9:
vmovaps xmm0, xmm1
ret
.LBB12_1:
# x == -Inf (bits 0xFF800000): return 0.
vxorps xmm1, xmm1, xmm1
cmp ecx, -8388608
je .LBB12_9
# +Inf or NaN: return x + x.
cmp eax, 2040
jae .LBB12_3
vucomiss xmm0, dword ptr [rip + .LCPI12_0]
jbe .LBB12_6
# Overflow: 2^97 * 2^97, routed through memory.
mov dword ptr [rsp - 8], 1879048192
vmovss xmm0, dword ptr [rsp - 8]
vmulss xmm0, xmm0, dword ptr [rip + .LCPI12_3]
ret
.LBB12_3:
vaddss xmm0, xmm0, xmm0
ret
.LBB12_6:
# Not below the underflow threshold: use the regular path after all.
vmovss xmm1, dword ptr [rip + .LCPI12_1]
vucomiss xmm1, xmm0
jbe .LBB12_8
# Underflow: 2^-95 * 2^-95, routed through memory.
mov dword ptr [rsp - 4], 268435456
vmovss xmm0, dword ptr [rsp - 4]
vmulss xmm0, xmm0, dword ptr [rip + .LCPI12_2]
ret
.Lfunc_end12:
.size expf, .Lfunc_end12-expf
.cfi_endproc
# Two-entry table for __math_oflowf: [0] = -2^97, [1] = +2^97.
.section .rodata.cst8,"aM",@progbits,8
.p2align 2, 0x0
.LCPI13_0:
.long 0xf0000000
.long 0x70000000
# Multiplier: +2^97.
.section .rodata.cst4,"aM",@progbits,4
.p2align 2, 0x0
.LCPI13_1:
.long 0x70000000
# __math_oflowf: produce an overflowed result by multiplying two huge values.
# In:    edi = sign selector (non-zero selects the negative table entry)
# Out:   xmm0 = (+/-2^97) * 2^97
# Clobb: eax, rcx, flags; scratch below rsp
.section .text.__math_oflowf,"ax",@progbits
.p2align 4, 0x90
.type __math_oflowf,@function
__math_oflowf:
.Lfunc_begin13:
.cfi_startproc
# rax = (edi == 0) ? 1 : 0, used as the table index.
xor eax, eax
test edi, edi
lea rcx, [rip + .LCPI13_0]
sete al
vmovss xmm0, dword ptr [rcx + 4*rax]
# The operand is stored and reloaded before the multiply; presumably a
# barrier against folding the product at compile time -- TODO confirm.
vmovss dword ptr [rsp - 4], xmm0
vmovss xmm0, dword ptr [rsp - 4]
vmulss xmm0, xmm0, dword ptr [rip + .LCPI13_1]
ret
.Lfunc_end13:
.size __math_oflowf, .Lfunc_end13-__math_oflowf
.cfi_endproc
# Two-entry table for __math_uflowf: [0] = -2^-95, [1] = +2^-95.
.section .rodata.cst8,"aM",@progbits,8
.p2align 2, 0x0
.LCPI14_0:
.long 0x90000000
.long 0x10000000
# Multiplier: +2^-95.
.section .rodata.cst4,"aM",@progbits,4
.p2align 2, 0x0
.LCPI14_1:
.long 0x10000000
# __math_uflowf: produce an underflowed result by multiplying two tiny values.
# In:    edi = sign selector (non-zero selects the negative table entry)
# Out:   xmm0 = (+/-2^-95) * 2^-95
# Clobb: eax, rcx, flags; scratch below rsp
.section .text.__math_uflowf,"ax",@progbits
.p2align 4, 0x90
.type __math_uflowf,@function
__math_uflowf:
.Lfunc_begin14:
.cfi_startproc
# rax = (edi == 0) ? 1 : 0, used as the table index.
xor eax, eax
test edi, edi
lea rcx, [rip + .LCPI14_0]
sete al
vmovss xmm0, dword ptr [rcx + 4*rax]
# Store/reload of the operand before the multiply; presumably an
# evaluation barrier -- TODO confirm.
vmovss dword ptr [rsp - 4], xmm0
vmovss xmm0, dword ptr [rsp - 4]
vmulss xmm0, xmm0, dword ptr [rip + .LCPI14_1]
ret
.Lfunc_end14:
.size __math_uflowf, .Lfunc_end14-__math_uflowf
.cfi_endproc
# Sign-bit mask used to negate a float.
.section .rodata.cst4,"aM",@progbits,4
.p2align 2, 0x0
.LCPI15_0:
.long 0x80000000
# __math_xflowf: return (sign ? -y : y) * y.
# In:    edi = sign selector, xmm0 = y
# Out:   xmm0 = product
# Clobb: eax, xmm1, k1, flags; scratch below rsp
.section .text.__math_xflowf,"ax",@progbits
.p2align 4, 0x90
.type __math_xflowf,@function
__math_xflowf:
.Lfunc_begin15:
.cfi_startproc
# xmm1 = -y; replaced by +y when edi == 0.
vxorps xmm1, xmm0, dword ptr [rip + .LCPI15_0]{1to4}
test edi, edi
sete al
kmovd k1, eax
vmovss xmm1 {k1}, xmm1, xmm0
# One operand goes through memory before the multiply; presumably an
# evaluation barrier -- TODO confirm.
vmovss dword ptr [rsp - 4], xmm1
vmulss xmm0, xmm0, dword ptr [rsp - 4]
ret
.Lfunc_end15:
.size __math_xflowf, .Lfunc_end15-__math_xflowf
.cfi_endproc
# feclearexcept: stub that ignores its argument and returns 0 in eax.
.section .text.feclearexcept,"ax",@progbits
.p2align 4, 0x90
.type feclearexcept,@function
feclearexcept:
.Lfunc_begin16:
.cfi_startproc
xor eax, eax
ret
.Lfunc_end16:
.size feclearexcept, .Lfunc_end16-feclearexcept
.cfi_endproc
# feraiseexcept: stub that ignores its argument and returns 0 in eax.
.section .text.feraiseexcept,"ax",@progbits
.p2align 4, 0x90
.type feraiseexcept,@function
feraiseexcept:
.Lfunc_begin17:
.cfi_startproc
xor eax, eax
ret
.Lfunc_end17:
.size feraiseexcept, .Lfunc_end17-feraiseexcept
.cfi_endproc
# fetestexcept: stub that ignores its argument and returns 0 in eax
# (i.e. it never reports a raised exception flag).
.section .text.fetestexcept,"ax",@progbits
.p2align 4, 0x90
.type fetestexcept,@function
fetestexcept:
.Lfunc_begin18:
.cfi_startproc
xor eax, eax
ret
.Lfunc_end18:
.size fetestexcept, .Lfunc_end18-fetestexcept
.cfi_endproc
# fegetround: stub that always returns 0 in eax; the hardware rounding
# mode is not read.
.section .text.fegetround,"ax",@progbits
.p2align 4, 0x90
.type fegetround,@function
fegetround:
.Lfunc_begin19:
.cfi_startproc
xor eax, eax
ret
.Lfunc_end19:
.size fegetround, .Lfunc_end19-fegetround
.cfi_endproc
# __fesetround: stub that ignores its argument and returns 0 in eax; the
# hardware rounding mode is not modified.
.section .text.__fesetround,"ax",@progbits
.p2align 4, 0x90
.type __fesetround,@function
__fesetround:
.Lfunc_begin20:
.cfi_startproc
xor eax, eax
ret
.Lfunc_end20:
.size __fesetround, .Lfunc_end20-__fesetround
.cfi_endproc
# fegetenv: stub that returns 0 in eax without writing through its argument.
.section .text.fegetenv,"ax",@progbits
.p2align 4, 0x90
.type fegetenv,@function
fegetenv:
.Lfunc_begin21:
.cfi_startproc
xor eax, eax
ret
.Lfunc_end21:
.size fegetenv, .Lfunc_end21-fegetenv
.cfi_endproc
# fesetenv: stub that returns 0 in eax without reading its argument.
.section .text.fesetenv,"ax",@progbits
.p2align 4, 0x90
.type fesetenv,@function
fesetenv:
.Lfunc_begin22:
.cfi_startproc
xor eax, eax
ret
.Lfunc_end22:
.size fesetenv, .Lfunc_end22-fesetenv
.cfi_endproc
# Constants for floorf: 2^120 and -1.0f.
.section .rodata.cst4,"aM",@progbits,4
.p2align 2, 0x0
.LCPI23_0:
.long 0x7b800000
.LCPI23_1:
.long 0xbf800000
# floorf: round toward -infinity using integer bit manipulation.
# In:    xmm0 = x
# Out:   xmm0 = floor(x)
# Clobb: eax, ecx, edx, esi, xmm1, k1, flags; scratch below rsp
.section .text.floorf,"ax",@progbits
.p2align 4, 0x90
.type floorf,@function
floorf:
.Lfunc_begin23:
.cfi_startproc
vmovd eax, xmm0
# ecx = biased exponent (BEXTR: 8 bits from bit 23).
mov ecx, 2071
bextr ecx, eax, ecx
cmp ecx, 149
jbe .LBB23_1
# Exponent >= 150: already integral (or Inf/NaN).
ret
.LBB23_1:
cmp ecx, 127
jb .LBB23_4
# edx = mask of the fractional mantissa bits for this exponent.
add ecx, -127
mov edx, 8388607
shrx edx, edx, ecx
test edx, eax
je .LBB23_6
# Sum is only stored to scratch, never reloaded here; presumably kept to
# force the floating-point evaluation -- TODO confirm.
vaddss xmm0, xmm0, dword ptr [rip + .LCPI23_0]
mov esi, -8388608
sarx ecx, esi, ecx
# Negative x: add the fraction mask so truncation rounds away from zero;
# positive x: add nothing.
mov esi, eax
sar esi, 31
and esi, edx
add esi, eax
and esi, ecx
vmovss dword ptr [rsp - 8], xmm0
vmovd xmm0, esi
ret
.LBB23_4:
# |x| < 1.
vaddss xmm1, xmm0, dword ptr [rip + .LCPI23_0]
vmovss dword ptr [rsp - 4], xmm1
vxorps xmm1, xmm1, xmm1
test eax, eax
js .LBB23_7
# Non-negative: result is +0.0.
vmovaps xmm0, xmm1
.LBB23_6:
ret
.LBB23_7:
# Negative: -0.0 is returned unchanged, anything else becomes -1.0.
vcmpeqss k1, xmm0, xmm1
vmovss xmm1, dword ptr [rip + .LCPI23_1]
vmovss xmm1 {k1}, xmm1, xmm0
vmovaps xmm0, xmm1
ret
.Lfunc_end23:
.size floorf, .Lfunc_end23-floorf
.cfi_endproc
# fmaf: x*y + z computed in double precision, with a one-ULP nudge of the
# double result in the halfway case so the final narrowing rounds correctly.
# In:    xmm0 = x, xmm1 = y, xmm2 = z
# Out:   xmm0 = result (single precision)
# Clobb: rax, rcx, rdx, xmm1-xmm3, k1, flags
.section .text.fmaf,"ax",@progbits
.p2align 4, 0x90
.type fmaf,@function
fmaf:
.Lfunc_begin24:
.cfi_startproc
vcvtss2sd xmm0, xmm0, xmm0
# rdx = 0x7FF0000000000000 (double exponent mask).
movabs rdx, 9218868437227405312
vcvtss2sd xmm1, xmm1, xmm1
vcvtss2sd xmm2, xmm2, xmm2
# xmm1 = xy (exact in double), xmm0 = xy + z.
vmulsd xmm1, xmm0, xmm1
vaddsd xmm0, xmm1, xmm2
vmovq rax, xmm0
# cl = low 29 bits differ from 0x10000000 (not exactly halfway between two
# single-precision values).
mov ecx, eax
and ecx, 536870911
cmp ecx, 268435456
setne cl
# dl = exponent field all ones (Inf/NaN).
andn rdx, rax, rdx
sete dl
or dl, cl
jne .LBB24_4
# Halfway case: if the double addition was exact, plain narrowing is right.
vsubsd xmm3, xmm0, xmm1
vucomisd xmm3, xmm2
jne .LBB24_3
jp .LBB24_3
vsubsd xmm3, xmm0, xmm2
vucomisd xmm3, xmm1
jne .LBB24_3
jp .LBB24_3
.LBB24_4:
vcvtsd2ss xmm0, xmm0, xmm0
ret
.LBB24_3:
# Inexact halfway case: compute the rounding error of the double addition
# (two operand orders, selected by k1) and step the double's bit pattern by
# +1 or -1 depending on the result sign and the sign of the error.
test rax, rax
vsubsd xmm3, xmm1, xmm0
vsubsd xmm0, xmm2, xmm0
sets cl
vucomisd xmm2, xmm1
vaddsd xmm0, xmm1, xmm0
vaddsd xmm3, xmm3, xmm2
vxorpd xmm1, xmm1, xmm1
setbe dl
xor dl, cl
kmovd k1, edx
vmovsd xmm0 {k1}, xmm0, xmm3
vucomisd xmm1, xmm0
setbe dl
xor dl, cl
# rcx = +1 when dl == 1, -1 when dl == 0.
movzx ecx, dl
dec rcx
or rcx, 1
add rcx, rax
vmovq xmm0, rcx
vcvtsd2ss xmm0, xmm0, xmm0
ret
.Lfunc_end24:
.size fmaf, .Lfunc_end24-fmaf
.cfi_endproc
# fmodf: floating-point remainder computed with integer shift-and-subtract
# on the mantissas.
# In:    xmm0 = x, xmm1 = y
# Out:   xmm0 = fmodf(x, y)
# Clobb: eax, ecx, edx, esi, edi, r8, r9, xmm1, k1, flags
# Register roles in the main path: eax = bits of x (sign source),
# esi = mantissa of x, edx = mantissa of y, ecx = exponent of x,
# edi = exponent of y.
.section .text.fmodf,"ax",@progbits
.p2align 4, 0x90
.type fmodf,@function
fmodf:
.Lfunc_begin25:
.cfi_startproc
vmovd edx, xmm1
# y == +/-0 (bits shifted left by one are zero) -> invalid.
mov esi, edx
add esi, edx
je .LBB25_2
mov r8d, edx
vmovd eax, xmm0
mov edi, 2071
and r8d, 2147483647
bextr ecx, eax, edi
# y is NaN (|bits| > 0x7F800000) or x has exponent 255 (Inf/NaN) -> invalid.
cmp r8d, 2139095041
setae r8b
cmp ecx, 255
sete r9b
or r9b, r8b
cmp r9b, 1
jne .LBB25_3
.LBB25_2:
# Invalid operands: return (x*y)/(x*y).
vmulss xmm0, xmm0, xmm1
vdivss xmm0, xmm0, xmm0
ret
.LBB25_3:
# |x| <= |y| is handled without the reduction loop.
lea r8d, [rax + rax]
cmp r8d, esi
jbe .LBB25_4
bextr edi, edx, edi
test ecx, ecx
je .LBB25_6
# Normal x: mantissa with the implicit leading one.
mov esi, eax
and esi, 8388607
or esi, 8388608
test edi, edi
je .LBB25_11
.LBB25_14:
# Normal y: mantissa with the implicit leading one.
and edx, 8388607
or edx, 8388608
cmp ecx, edi
jg .LBB25_16
.LBB25_21:
# Exponents equal: one final trial subtraction.
mov edi, esi
sub edi, edx
jns .LBB25_22
jmp .LBB25_23
.LBB25_4:
# |x| == |y| -> x * 0 (signed zero); |x| < |y| -> x.
vpxor xmm1, xmm1, xmm1
sete al
vmulss xmm1, xmm0, xmm1
kmovd k1, eax
vmovss xmm0 {k1}, xmm0, xmm1
ret
.LBB25_6:
# Subnormal x: count the shift needed to normalise; ecx becomes <= 0.
mov esi, eax
xor ecx, ecx
shl esi, 9
js .LBB25_8
.p2align 4, 0x90
.LBB25_7:
dec ecx
add esi, esi
jns .LBB25_7
.LBB25_8:
mov sil, 1
sub sil, cl
shlx esi, eax, esi
test edi, edi
jne .LBB25_14
.LBB25_11:
# Subnormal y: same normalisation, exponent in edi.
mov r8d, edx
xor edi, edi
shl r8d, 9
js .LBB25_13
.p2align 4, 0x90
.LBB25_12:
dec edi
add r8d, r8d
jns .LBB25_12
.LBB25_13:
mov r8b, 1
sub r8b, dil
shlx edx, edx, r8d
cmp ecx, edi
jg .LBB25_16
jmp .LBB25_21
.p2align 4, 0x90
.LBB25_19:
# Shift the running remainder left and lower the exponent of x.
add esi, esi
dec ecx
cmp ecx, edi
jle .LBB25_20
.LBB25_16:
# Reduction loop: subtract y's mantissa whenever it fits; an exact zero
# remainder exits early.
mov r8d, esi
sub r8d, edx
js .LBB25_19
mov esi, r8d
jne .LBB25_19
jmp .LBB25_18
.LBB25_20:
mov ecx, edi
mov edi, esi
sub edi, edx
js .LBB25_23
.LBB25_22:
mov esi, edi
je .LBB25_18
.LBB25_23:
# Renormalise: shift left until bit 23 is set, lowering the exponent.
cmp esi, 8388607
ja .LBB25_24
.p2align 4, 0x90
.LBB25_25:
lea edx, [rsi + rsi]
dec ecx
cmp esi, 4194304
mov esi, edx
jb .LBB25_25
and eax, -2147483648
test ecx, ecx
jle .LBB25_28
.LBB25_27:
# Normal result: drop the implicit one, insert the exponent, apply x's sign.
add edx, -8388608
shl ecx, 23
or ecx, edx
or ecx, eax
vmovd xmm0, ecx
ret
.LBB25_18:
# Exact zero remainder: x * 0 keeps the sign of x.
vpxor xmm1, xmm1, xmm1
vmulss xmm0, xmm0, xmm1
ret
.LBB25_24:
mov edx, esi
and eax, -2147483648
test ecx, ecx
jg .LBB25_27
.LBB25_28:
# Subnormal result: shift the mantissa right by (1 - exponent).
mov sil, 1
sub sil, cl
shrx ecx, edx, esi
or ecx, eax
vmovd xmm0, ecx
ret
.Lfunc_end25:
.size fmodf, .Lfunc_end25-fmodf
.cfi_endproc
# __math_invalidf: return (x - x) / (x - x), which is NaN for any input.
# In:    xmm0 = x
# Out:   xmm0 = NaN
.section .text.__math_invalidf,"ax",@progbits
.p2align 4, 0x90
.type __math_invalidf,@function
__math_invalidf:
.Lfunc_begin26:
.cfi_startproc
vsubss xmm0, xmm0, xmm0
vdivss xmm0, xmm0, xmm0
ret
.Lfunc_end26:
.size __math_invalidf, .Lfunc_end26-__math_invalidf
.cfi_endproc
# Single-precision constants for powf: 1.0, sign mask, 2^23, and the
# multipliers used on the underflow (2^-95) and overflow (2^97) exits.
.section .rodata.cst4,"aM",@progbits,4
.p2align 2, 0x0
.LCPI27_0:
.long 0x3f800000
.LCPI27_1:
.long 0x80000000
.LCPI27_2:
.long 0x4b000000
.LCPI27_12:
.long 0x10000000
.LCPI27_20:
.long 0x70000000
# Double-precision constants for powf: -1.0, five log2 polynomial
# coefficients (_4.._8), overflow/underflow bounds (_9, _10), the
# +/- rounding shift (_13, _14), exp2 polynomial coefficients (_15.._17),
# 1.0 (_18), and two sign-indexed float pairs (_11 tiny, _19 huge).
.section .rodata.cst8,"aM",@progbits,8
.p2align 3, 0x0
.LCPI27_3:
.quad 0xbff0000000000000
.LCPI27_4:
.quad 0x3fd27616c9496e0b
.LCPI27_5:
.quad 0xbfd71969a075c67a
.LCPI27_6:
.quad 0x3fdec70a6ca7badd
.LCPI27_7:
.quad 0xbfe7154748bef6c8
.LCPI27_8:
.quad 0x3ff71547652ab82b
.LCPI27_9:
.quad 0x405fffffffd1d571
.LCPI27_10:
.quad 0xc062c00000000000
.LCPI27_11:
.long 0x90000000
.long 0x10000000
.LCPI27_13:
.quad 0x42e8000000000000
.LCPI27_14:
.quad 0xc2e8000000000000
.LCPI27_15:
.quad 0x3fac6af84b912394
.LCPI27_16:
.quad 0x3fcebfce50fac4f3
.LCPI27_17:
.quad 0x3fe62e42ff0c52d6
.LCPI27_18:
.quad 0x3ff0000000000000
.LCPI27_19:
.long 0xf0000000
.long 0x70000000
# powf: x^y computed as exp2(y * log2(x)) in double precision, using the
# tables __powf_log2_data and __exp2f_data (both defined outside this view).
# In:    xmm0 = x, xmm1 = y
# Out:   xmm0 = powf(x, y)
# Clobb: rax, rcx, rdx, rsi, rdi, xmm1-xmm6, k1, k2, flags; scratch below rsp
# Register roles: edx = bits of x, eax = bits of y, ecx = sign bias
# (0 or 65536) applied to the result for negative x with odd integer y.
.section .text.powf,"ax",@progbits
.p2align 4, 0x90
.type powf,@function
powf:
.Lfunc_begin27:
.cfi_startproc
vmovd edx, xmm0
vmovd eax, xmm1
# Unsigned range trick: taken when x is zero, subnormal, negative, Inf or NaN.
lea ecx, [rdx - 2139095040]
cmp ecx, -2130706432
jb .LBB27_2
# Same kind of test for y being zero, Inf or NaN.
lea esi, [rax + rax + 16777216]
xor ecx, ecx
cmp esi, 16777216
jbe .LBB27_2
.LBB27_24:
# log2(x): subtract an offset from the bits of x, use four bits of the
# result as a table index (16-byte entries) and the top bits as the integer
# exponent k; the remaining value is multiplied by the first table field.
lea eax, [rdx - 1060306944]
mov esi, eax
mov edi, eax
shr esi, 19
and edi, -8388608
sar eax, 23
sub edx, edi
shl esi, 4
lea rdi, [rip + __powf_log2_data]
vmovd xmm0, edx
movzx esi, sil
# rdx = mask selecting bits 47..62 of a double (used for the range check).
movabs rdx, 9223231299366420480
vcvtss2sd xmm0, xmm0, xmm0
vmulsd xmm0, xmm0, qword ptr [rsi + rdi]
vaddsd xmm0, xmm0, qword ptr [rip + .LCPI27_3]
# Polynomial in r = z*table[i] - 1, combined with k + second table field.
vmulsd xmm5, xmm0, qword ptr [rip + .LCPI27_6]
vmulsd xmm4, xmm0, qword ptr [rip + .LCPI27_4]
vaddsd xmm5, xmm5, qword ptr [rip + .LCPI27_7]
vaddsd xmm4, xmm4, qword ptr [rip + .LCPI27_5]
vcvtsi2sd xmm2, xmm2, eax
vmulsd xmm3, xmm0, xmm0
vaddsd xmm2, xmm2, qword ptr [rsi + rdi + 8]
vmulsd xmm0, xmm0, qword ptr [rip + .LCPI27_8]
movabs rsi, 4638426141214900225
vmulsd xmm6, xmm3, xmm3
vmulsd xmm3, xmm3, xmm5
vmulsd xmm4, xmm4, xmm6
vaddsd xmm0, xmm2, xmm0
vaddsd xmm0, xmm0, xmm3
vaddsd xmm0, xmm4, xmm0
# xmm0 = y * log2(x).
vcvtss2sd xmm1, xmm1, xmm1
vmulsd xmm0, xmm0, xmm1
vmovq rax, xmm0
# Large |y*log2(x)| (masked bits at or above those of 126.0) needs the
# overflow/underflow checks.
and rdx, rax
cmp rdx, rsi
jae .LBB27_25
.LBB27_29:
# exp2: round via add/subtract of the shift constant, index a 32-entry
# table with the low 5 bits, fold (integer + sign bias) << 47 into the
# table entry, and multiply by a polynomial in the fractional part.
vaddsd xmm1, xmm0, qword ptr [rip + .LCPI27_13]
lea rdx, [rip + __exp2f_data]
vmovq rax, xmm1
vaddsd xmm1, xmm1, qword ptr [rip + .LCPI27_14]
add ecx, eax
and eax, 31
shl rcx, 47
add rcx, qword ptr [rdx + 8*rax]
vsubsd xmm0, xmm0, xmm1
vmulsd xmm2, xmm0, qword ptr [rip + .LCPI27_15]
vmovq xmm1, rcx
vmulsd xmm3, xmm0, xmm0
vaddsd xmm2, xmm2, qword ptr [rip + .LCPI27_16]
vmulsd xmm0, xmm0, qword ptr [rip + .LCPI27_17]
vaddsd xmm0, xmm0, qword ptr [rip + .LCPI27_18]
vmulsd xmm2, xmm3, xmm2
vaddsd xmm0, xmm0, xmm2
vmulsd xmm0, xmm0, xmm1
vcvtsd2ss xmm0, xmm0, xmm0
.LBB27_30:
ret
.LBB27_2:
# Special-case dispatch. ecx = 2*bits(y).
lea ecx, [rax + rax]
lea esi, [rcx - 1]
# y is zero, Inf or NaN.
cmp esi, -16777217
jae .LBB27_3
# x is zero, Inf or NaN.
lea ecx, [rdx + rdx - 1]
cmp ecx, -16777217
jae .LBB27_10
# x and y are finite and non-zero here.
xor ecx, ecx
test edx, edx
js .LBB27_16
cmp edx, 8388607
ja .LBB27_24
.LBB27_23:
# Subnormal x: scale by 2^23, clear the sign and compensate the exponent
# (subtract 23 << 23) before taking the logarithm.
vmulss xmm0, xmm0, dword ptr [rip + .LCPI27_2]
vmovd edx, xmm0
and edx, 2147483647
add edx, -192937984
jmp .LBB27_24
.LBB27_25:
vucomisd xmm0, qword ptr [rip + .LCPI27_9]
jbe .LBB27_27
# Overflow: (+/-2^97) * 2^97, sign chosen by the sign bias in ecx.
xor eax, eax
test ecx, ecx
lea rcx, [rip + .LCPI27_19]
sete al
vmovss xmm0, dword ptr [rcx + 4*rax]
vmovss dword ptr [rsp - 8], xmm0
vmovss xmm0, dword ptr [rsp - 8]
vmulss xmm0, xmm0, dword ptr [rip + .LCPI27_20]
ret
.LBB27_16:
# Finite negative x: classify y by its exponent.
mov ecx, 2071
bextr ecx, eax, ecx
# |y| < 1 cannot be an integer -> invalid.
cmp ecx, 127
jb .LBB27_31
cmp ecx, 150
jbe .LBB27_18
.LBB27_20:
# y is an even integer: no sign bias.
xor ecx, ecx
.LBB27_21:
# Continue with |x|; subnormal |x| is normalised first.
vmovd edx, xmm0
and edx, 2147483647
cmp edx, 8388607
ja .LBB27_24
jmp .LBB27_23
.LBB27_27:
# Above the underflow bound (-150): regular exp2 path.
vmovsd xmm1, qword ptr [rip + .LCPI27_10]
vucomisd xmm1, xmm0
jb .LBB27_29
# Underflow: (+/-2^-95) * 2^-95, sign chosen by the sign bias in ecx.
xor eax, eax
test ecx, ecx
lea rcx, [rip + .LCPI27_11]
sete al
vmovss xmm0, dword ptr [rcx + 4*rax]
vmovss dword ptr [rsp - 4], xmm0
vmovss xmm0, dword ptr [rsp - 4]
vmulss xmm0, xmm0, dword ptr [rip + .LCPI27_12]
ret
.LBB27_18:
# dl = 150 - exponent = number of fractional mantissa bits of y;
# any set fractional bit means y is not an integer.
mov dl, -106
sub dl, cl
bzhi ecx, eax, edx
je .LBB27_19
.LBB27_31:
# Negative x with non-integer y: return NaN via (x - x) / (x - x).
vsubss xmm0, xmm0, xmm0
vdivss xmm0, xmm0, xmm0
ret
.LBB27_19:
# Test the units bit of y: odd -> sign bias 65536, even -> 0.
mov ecx, 1
shlx edx, ecx, edx
mov ecx, 65536
test edx, eax
jne .LBB27_21
jmp .LBB27_20
.LBB27_3:
# y is zero/Inf/NaN. Return 1.0 when y == +/-0 or x == 1.0.
vmovss xmm2, dword ptr [rip + .LCPI27_0]
test ecx, ecx
sete sil
cmp edx, 1065353216
sete dil
or dil, sil
je .LBB27_5
vmovaps xmm0, xmm2
ret
.LBB27_10:
# x is zero/Inf/NaN: start from x*x, negate it when x is negative and y is
# an odd integer, and take the reciprocal when y is negative.
vmulss xmm0, xmm0, xmm0
test edx, edx
jns .LBB27_13
mov ecx, 2071
bextr ecx, eax, ecx
# Only exponents 127..150 can describe an odd integer.
lea edx, [rcx - 151]
cmp edx, -24
jb .LBB27_13
vxorps xmm1, xmm0, dword ptr [rip + .LCPI27_1]{1to4}
mov dl, -106
sub dl, cl
bzhi ecx, eax, edx
movzx edx, dl
# k1 = y has fractional bits; k2 = units bit of y is clear (even).
setne cl
bt eax, edx
setae dl
kmovd k1, ecx
kmovd k2, edx
vmovss xmm1 {k2}, xmm1, xmm0
vmovss xmm1 {k1}, xmm1, xmm0
vmovaps xmm0, xmm1
.LBB27_13:
test eax, eax
jns .LBB27_30
# Negative y: 1 / result, with the quotient routed through memory
# (presumably an evaluation barrier -- TODO confirm).
vmovss xmm1, dword ptr [rip + .LCPI27_0]
vdivss xmm0, xmm1, xmm0
vmovss dword ptr [rsp - 12], xmm0
vmovss xmm0, dword ptr [rsp - 12]
ret
.LBB27_5:
# Either operand NaN (doubled bits above 0xFF000000): return x + y.
add edx, edx
cmp edx, -16777215
setae sil
cmp ecx, -16777215
setae cl
or cl, sil
cmp cl, 1
jne .LBB27_7
vaddss xmm0, xmm0, xmm1
ret
.LBB27_7:
# y is +/-Inf. |x| == 1 -> 1.0; otherwise 0.0 or y*y (= +Inf) depending on
# whether |x| < 1 and on the sign of y.
vmovaps xmm0, xmm2
cmp edx, 2130706432
je .LBB27_30
setae cl
test eax, eax
vmulss xmm0, xmm1, xmm1
vxorps xmm1, xmm1, xmm1
setns al
xor al, cl
kmovd k1, eax
vmovss xmm0 {k1}, xmm0, xmm1
ret
.Lfunc_end27:
.size powf, .Lfunc_end27-powf
.cfi_endproc
# Constants for roundf: abs mask, +2^23, -2^23, +0.5, -0.5, +1.0, -1.0,
# sign mask.
.section .rodata.cst4,"aM",@progbits,4
.p2align 2, 0x0
.LCPI28_0:
.long 0x7fffffff
.LCPI28_1:
.long 0x4b000000
.LCPI28_2:
.long 0xcb000000
.LCPI28_3:
.long 0x3f000000
.LCPI28_4:
.long 0xbf000000
.LCPI28_5:
.long 0x3f800000
.LCPI28_6:
.long 0xbf800000
.LCPI28_7:
.long 0x80000000
# roundf: round to nearest integer, halfway cases away from zero.
# In:    xmm0 = x
# Out:   xmm0 = roundf(x)
# Clobb: eax, ecx, xmm1, xmm2, k1, flags; scratch below rsp
.section .text.roundf,"ax",@progbits
.p2align 4, 0x90
.type roundf,@function
roundf:
.Lfunc_begin28:
.cfi_startproc
vmovd eax, xmm0
# ecx = biased exponent (BEXTR: 8 bits from bit 23).
mov ecx, 2071
bextr ecx, eax, ecx
# Exponent >= 150: already integral (or Inf/NaN).
cmp ecx, 149
ja .LBB28_8
# xmm1 = |x|, xmm2 = |x| + 2^23.
vpandd xmm1, xmm0, dword ptr [rip + .LCPI28_0]{1to4}
vaddss xmm2, xmm1, dword ptr [rip + .LCPI28_1]
cmp ecx, 125
ja .LBB28_3
# |x| < 0.5: result is x * 0 (a zero carrying the sign of x). The sum in
# xmm2 is only stored to scratch here.
vxorps xmm1, xmm1, xmm1
vmovss dword ptr [rsp - 4], xmm2
vmulss xmm0, xmm0, xmm1
ret
.LBB28_3:
# xmm0 = (|x| + 2^23 - 2^23) - |x|: the correction the hardware rounding
# applied to |x|.
vaddss xmm0, xmm2, dword ptr [rip + .LCPI28_2]
vsubss xmm0, xmm0, xmm1
vucomiss xmm0, dword ptr [rip + .LCPI28_3]
jbe .LBB28_5
# Correction > 0.5: rounded up too far, subtract 1.
vaddss xmm0, xmm1, xmm0
vaddss xmm0, xmm0, dword ptr [rip + .LCPI28_6]
jmp .LBB28_7
.LBB28_5:
vmovss xmm2, dword ptr [rip + .LCPI28_4]
vucomiss xmm2, xmm0
vaddss xmm0, xmm1, xmm0
jb .LBB28_7
# Correction <= -0.5: rounded down on a tie, add 1.
vaddss xmm0, xmm0, dword ptr [rip + .LCPI28_5]
.LBB28_7:
# Restore the sign of the original x.
vxorps xmm1, xmm0, dword ptr [rip + .LCPI28_7]{1to4}
test eax, eax
sets al
kmovd k1, eax
vmovss xmm0 {k1}, xmm0, xmm1
.LBB28_8:
ret
.Lfunc_end28:
.size roundf, .Lfunc_end28-roundf
.cfi_endproc
# NUL-terminated string referenced by the library header below
# (28 characters + terminator = 29 bytes).
.type __unnamed_1,@object
.section .rodata.__unnamed_1,"a",@progbits
__unnamed_1:
.asciz "mmt3d_kernel_linked_llvm_cpu"
.size __unnamed_1, 29
# Library header record (24 bytes): a 32-bit value 4, 4 bytes of padding,
# a pointer to the name string __unnamed_1, and two 32-bit zero fields.
# NOTE(review): the leading 4 presumably is the library version (the query
# function compares its argument against 4) -- confirm against the IREE
# executable library header.
.type iree_hal_executable_library_query_v0_header,@object
.section .data.rel.ro.iree_hal_executable_library_query_v0_header,"aw",@progbits
.p2align 4, 0x0
iree_hal_executable_library_query_v0_header:
.long 4
.zero 4
.quad __unnamed_1
.long 0
.long 0
.size iree_hal_executable_library_query_v0_header, 24
# Table of the four dispatch entry points, in dispatch-number order
# (4 pointers = 32 bytes).
.type iree_hal_executable_library_query_v0_funcs,@object
.section .data.rel.ro.iree_hal_executable_library_query_v0_funcs,"aw",@progbits
.p2align 4, 0x0
iree_hal_executable_library_query_v0_funcs:
.quad turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_0_pack_f32
.quad turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack
.quad turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_2_batch_mmt4d_DxDx540x3200x16x16x1_f32xf16xf32
.quad turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_3_unpack_f32
.size iree_hal_executable_library_query_v0_funcs, 32
# Attribute table: 16 zero bytes.
# NOTE(review): presumably one 4-byte all-zero record per entry point
# (4 x 4 = 16) -- confirm the record layout.
.type iree_hal_executable_library_query_v0_attrs,@object
.section .rodata.iree_hal_executable_library_query_v0_attrs,"a",@progbits
.p2align 3, 0x0
iree_hal_executable_library_query_v0_attrs:
.zero 16
.size iree_hal_executable_library_query_v0_attrs, 16
# Entry-point name strings (NUL-terminated; each .size includes the
# terminator), followed by the table of pointers to them.
.type __unnamed_2,@object
.section .rodata.__unnamed_2,"a",@progbits
__unnamed_2:
.asciz "turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_0_pack_f32"
.size __unnamed_2, 58
.type __unnamed_3,@object
.section .rodata.__unnamed_3,"a",@progbits
__unnamed_3:
.asciz "turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack"
.size __unnamed_3, 78
.type __unnamed_4,@object
.section .rodata.__unnamed_4,"a",@progbits
__unnamed_4:
.asciz "turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_2_batch_mmt4d_DxDx540x3200x16x16x1_f32xf16xf32"
.size __unnamed_4, 94
.type __unnamed_5,@object
.section .rodata.__unnamed_5,"a",@progbits
__unnamed_5:
.asciz "turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_3_unpack_f32"
.size __unnamed_5, 60
# Name table: one pointer per entry point, same order as the function table
# (4 pointers = 32 bytes).
.type iree_hal_executable_library_query_v0_names,@object
.section .data.rel.ro.iree_hal_executable_library_query_v0_names,"aw",@progbits
.p2align 4, 0x0
iree_hal_executable_library_query_v0_names:
.quad __unnamed_2
.quad __unnamed_3
.quad __unnamed_4
.quad __unnamed_5
.size iree_hal_executable_library_query_v0_names, 32
# Source file name strings. Four separate objects with identical contents
# ("mmt3d_kernel.mlir", 17 characters + NUL = 18 bytes), each in its own
# read-only section. They are referenced one-to-one by the records in
# iree_hal_executable_library_query_v0_source_locations.
.type __unnamed_6,@object
.section .rodata.__unnamed_6,"a",@progbits
__unnamed_6:
.asciz "mmt3d_kernel.mlir"
.size __unnamed_6, 18
.type __unnamed_7,@object
.section .rodata.__unnamed_7,"a",@progbits
__unnamed_7:
.asciz "mmt3d_kernel.mlir"
.size __unnamed_7, 18
.type __unnamed_8,@object
.section .rodata.__unnamed_8,"a",@progbits
__unnamed_8:
.asciz "mmt3d_kernel.mlir"
.size __unnamed_8, 18
.type __unnamed_9,@object
.section .rodata.__unnamed_9,"a",@progbits
__unnamed_9:
.asciz "mmt3d_kernel.mlir"
.size __unnamed_9, 18
# Source location table: four 16-byte records (64 bytes, 16-byte aligned).
# Each record is two 32-bit values followed by an 8-byte pointer to one of
# the "mmt3d_kernel.mlir" strings. The second 32-bit value (17) equals the
# length of that string without its NUL terminator.
# NOTE(review): the first 32-bit value (4) is presumably the source line
# number of the dispatch -- confirm against the IREE executable library header.
.type iree_hal_executable_library_query_v0_source_locations,@object
.section .data.rel.ro.iree_hal_executable_library_query_v0_source_locations,"aw",@progbits
.p2align 4, 0x0
iree_hal_executable_library_query_v0_source_locations:
# Record for export 0.
.long 4
.long 17
.quad __unnamed_6
# Record for export 1.
.long 4
.long 17
.quad __unnamed_7
# Record for export 2.
.long 4
.long 17
.quad __unnamed_8
# Record for export 3.
.long 4
.long 17
.quad __unnamed_9
.size iree_hal_executable_library_query_v0_source_locations, 64
# Per-dispatch stage tables. For each of the four dispatches there is a
# *_stage_names object and a *_stage_source_locations object. All eight are
# empty: the label is followed by no data and .size is 0, so each symbol only
# provides an 8-byte-aligned address for
# iree_hal_executable_library_query_v0_stage_location_tables to point at.

# Dispatch 0 (pack_f32): empty stage name and stage source location arrays.
.type iree_hal_executable_library_query_v0_turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_0_pack_f32_stage_names,@object
.section .rodata.iree_hal_executable_library_query_v0_turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_0_pack_f32_stage_names,"a",@progbits
.p2align 3, 0x0
iree_hal_executable_library_query_v0_turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_0_pack_f32_stage_names:
.size iree_hal_executable_library_query_v0_turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_0_pack_f32_stage_names, 0
.type iree_hal_executable_library_query_v0_turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_0_pack_f32_stage_source_locations,@object
.section .rodata.iree_hal_executable_library_query_v0_turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_0_pack_f32_stage_source_locations,"a",@progbits
.p2align 3, 0x0
iree_hal_executable_library_query_v0_turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_0_pack_f32_stage_source_locations:
.size iree_hal_executable_library_query_v0_turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_0_pack_f32_stage_source_locations, 0
# Dispatch 1 (generic f16 pack): empty stage name and stage source location arrays.
.type iree_hal_executable_library_query_v0_turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack_stage_names,@object
.section .rodata.iree_hal_executable_library_query_v0_turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack_stage_names,"a",@progbits
.p2align 3, 0x0
iree_hal_executable_library_query_v0_turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack_stage_names:
.size iree_hal_executable_library_query_v0_turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack_stage_names, 0
.type iree_hal_executable_library_query_v0_turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack_stage_source_locations,@object
.section .rodata.iree_hal_executable_library_query_v0_turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack_stage_source_locations,"a",@progbits
.p2align 3, 0x0
iree_hal_executable_library_query_v0_turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack_stage_source_locations:
.size iree_hal_executable_library_query_v0_turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack_stage_source_locations, 0
# Dispatch 2 (batch_mmt4d): empty stage name and stage source location arrays.
.type iree_hal_executable_library_query_v0_turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_2_batch_mmt4d_DxDx540x3200x16x16x1_f32xf16xf32_stage_names,@object
.section .rodata.iree_hal_executable_library_query_v0_turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_2_batch_mmt4d_DxDx540x3200x16x16x1_f32xf16xf32_stage_names,"a",@progbits
.p2align 3, 0x0
iree_hal_executable_library_query_v0_turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_2_batch_mmt4d_DxDx540x3200x16x16x1_f32xf16xf32_stage_names:
.size iree_hal_executable_library_query_v0_turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_2_batch_mmt4d_DxDx540x3200x16x16x1_f32xf16xf32_stage_names, 0
.type iree_hal_executable_library_query_v0_turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_2_batch_mmt4d_DxDx540x3200x16x16x1_f32xf16xf32_stage_source_locations,@object
.section .rodata.iree_hal_executable_library_query_v0_turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_2_batch_mmt4d_DxDx540x3200x16x16x1_f32xf16xf32_stage_source_locations,"a",@progbits
.p2align 3, 0x0
iree_hal_executable_library_query_v0_turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_2_batch_mmt4d_DxDx540x3200x16x16x1_f32xf16xf32_stage_source_locations:
.size iree_hal_executable_library_query_v0_turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_2_batch_mmt4d_DxDx540x3200x16x16x1_f32xf16xf32_stage_source_locations, 0
# Dispatch 3 (unpack_f32): empty stage name and stage source location arrays.
.type iree_hal_executable_library_query_v0_turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_3_unpack_f32_stage_names,@object
.section .rodata.iree_hal_executable_library_query_v0_turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_3_unpack_f32_stage_names,"a",@progbits
.p2align 3, 0x0
iree_hal_executable_library_query_v0_turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_3_unpack_f32_stage_names:
.size iree_hal_executable_library_query_v0_turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_3_unpack_f32_stage_names, 0
.type iree_hal_executable_library_query_v0_turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_3_unpack_f32_stage_source_locations,@object
.section .rodata.iree_hal_executable_library_query_v0_turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_3_unpack_f32_stage_source_locations,"a",@progbits
.p2align 3, 0x0
iree_hal_executable_library_query_v0_turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_3_unpack_f32_stage_source_locations:
.size iree_hal_executable_library_query_v0_turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_3_unpack_f32_stage_source_locations, 0
# Stage location tables: four 24-byte records (96 bytes, 16-byte aligned),
# one per dispatch. Each record is a 32-bit zero, four zero bytes that bring
# the following pointers to an 8-byte offset, then a pointer to that
# dispatch's *_stage_names object and a pointer to its
# *_stage_source_locations object (both of which are zero-size).
# NOTE(review): the leading 32-bit zero is presumably the stage count --
# confirm against the IREE executable library header.
.type iree_hal_executable_library_query_v0_stage_location_tables,@object
.section .data.rel.ro.iree_hal_executable_library_query_v0_stage_location_tables,"aw",@progbits
.p2align 4, 0x0
iree_hal_executable_library_query_v0_stage_location_tables:
# Record for dispatch 0.
.long 0
.zero 4
.quad iree_hal_executable_library_query_v0_turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_0_pack_f32_stage_names
.quad iree_hal_executable_library_query_v0_turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_0_pack_f32_stage_source_locations
# Record for dispatch 1.
.long 0
.zero 4
.quad iree_hal_executable_library_query_v0_turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack_stage_names
.quad iree_hal_executable_library_query_v0_turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack_stage_source_locations
# Record for dispatch 2.
.long 0
.zero 4
.quad iree_hal_executable_library_query_v0_turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_2_batch_mmt4d_DxDx540x3200x16x16x1_f32xf16xf32_stage_names
.quad iree_hal_executable_library_query_v0_turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_2_batch_mmt4d_DxDx540x3200x16x16x1_f32xf16xf32_stage_source_locations
# Record for dispatch 3.
.long 0
.zero 4
.quad iree_hal_executable_library_query_v0_turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_3_unpack_f32_stage_names
.quad iree_hal_executable_library_query_v0_turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_3_unpack_f32_stage_source_locations
.size iree_hal_executable_library_query_v0_stage_location_tables, 96
# Top-level library descriptor (104 bytes, 16-byte aligned). It ties together
# the tables emitted in this file: a pointer to the header object, a count of
# four, and pointers to the function, attribute, name, source location and
# stage location tables.
# NOTE(review): the field names in the comments below are inferred from the
# layout of IREE's v0 executable library struct and are not established by
# this file -- confirm against the IREE executable library header.
.type iree_hal_executable_library_query_v0,@object
.section .data.rel.ro.iree_hal_executable_library_query_v0,"aw",@progbits
.p2align 4, 0x0
iree_hal_executable_library_query_v0:
# Pointer to the header object (defined outside this part of the file).
.quad iree_hal_executable_library_query_v0_header
# Sixteen zero bytes (presumably an empty import table: count + symbol list).
.zero 16
# Count of four, matching the four entries in each table below, followed by
# four zero bytes that keep the following pointers at an 8-byte offset.
.long 4
.zero 4
# Pointers to the per-export tables. The fourth slot is a null pointer
# (presumably an optional table that was not emitted).
.quad iree_hal_executable_library_query_v0_funcs
.quad iree_hal_executable_library_query_v0_attrs
.quad iree_hal_executable_library_query_v0_names
.quad 0
.quad iree_hal_executable_library_query_v0_source_locations
.quad iree_hal_executable_library_query_v0_stage_location_tables
# Trailing 24 zero bytes (presumably an empty constant table and an empty
# embedded-source table).
.zero 4
.zero 4
.zero 16
.size iree_hal_executable_library_query_v0, 104
# Read-only constant table of 37 64-bit values (296 bytes, 8-byte aligned),
# written as hexadecimal bit patterns. The value 0x3ff0000000000000 that
# appears in the table is the IEEE-754 binary64 encoding of 1.0.
# NOTE(review): the name and shape match the powf log2 lookup data of common
# libm implementations (16 pairs of {1/c, log2(c)} followed by 5 polynomial
# coefficients) -- confirm against the libm source that IREE bundles.
.type __powf_log2_data,@object
.section .rodata.__powf_log2_data,"a",@progbits
.p2align 3, 0x0
__powf_log2_data:
# First 32 values (presumably 16 two-value table entries).
.quad 0x3ff661ec79f8f3be
.quad 0xbfdefec65b963019
.quad 0x3ff571ed4aaf883d
.quad 0xbfdb0b6832d4fca4
.quad 0x3ff49539f0f010b0
.quad 0xbfd7418b0a1fb77b
.quad 0x3ff3c995b0b80385
.quad 0xbfd39de91a6dcf7b
.quad 0x3ff30d190c8864a5
.quad 0xbfd01d9bf3f2b631
.quad 0x3ff25e227b0b8ea0
.quad 0xbfc97c1d1b3b7af0
.quad 0x3ff1bb4a4a1a343f
.quad 0xbfc2f9e393af3c9f
.quad 0x3ff12358f08ae5ba
.quad 0xbfb960cbbf788d5c
.quad 0x3ff0953f419900a7
.quad 0xbfaa6f9db6475fce
.quad 0x3ff0000000000000
.quad 0x0000000000000000
.quad 0x3fee608cfd9a47ac
.quad 0x3fb338ca9f24f53d
.quad 0x3feca4b31f026aa0
.quad 0x3fc476a9543891ba
.quad 0x3feb2036576afce6
.quad 0x3fce840b4ac4e4d2
.quad 0x3fe9c2d163a1aa2d
.quad 0x3fd40645f0c6651c
.quad 0x3fe886e6037841ed
.quad 0x3fd88e9c2c1b9ff8
.quad 0x3fe767dcf5534862
.quad 0x3fdce0a44eb17bcc
# Last five values (presumably the polynomial coefficients).
.quad 0x3fd27616c9496e0b
.quad 0xbfd71969a075c67a
.quad 0x3fdec70a6ca7badd
.quad 0xbfe7154748bef6c8
.quad 0x3ff71547652ab82b
.size __powf_log2_data, 296
# Read-only constant table of 41 64-bit values (328 bytes, 8-byte aligned).
# The first 32 values are written in decimal and the last 9 in hexadecimal.
# The first decimal value, 4607182418800017408, equals 0x3ff0000000000000,
# the IEEE-754 binary64 encoding of 1.0.
# NOTE(review): the name and shape match the exp2f lookup data of common libm
# implementations (a 32-entry table followed by shift, scale and polynomial
# constants) -- confirm against the libm source that IREE bundles.
.type __exp2f_data,@object
.section .rodata.__exp2f_data,"a",@progbits
.p2align 3, 0x0
__exp2f_data:
# Thirty-two table entries, emitted as decimal integers.
.quad 4607182418800017408
.quad 4607140297302181236
.quad 4607100335213349135
.quad 4607062579818421073
.quad 4607027079437701499
.quad 4606993883449571754
.quad 4606963042313658936
.quad 4606934607594512097
.quad 4606908631985796885
.quad 4606885169335019979
.quad 4606864274668794914
.quad 4606846004218661165
.quad 4606830415447468583
.quad 4606817567076339586
.quad 4606807519112221737
.quad 4606800332876043653
.quad 4606796071031487437
.quad 4606794797614391156
.quad 4606796578062795143
.quad 4606801479247646227
.quad 4606809569504174299
.quad 4606820918663955941
.quad 4606835598087680144
.quad 4606853680698631517
.quad 4606875241016906669
.quad 4606900355194379847
.quad 4606929101050434204
.quad 4606961558108475497
.quad 4606997807633245319
.quad 4607037932668951391
.quad 4607082018078232794
.quad 4607130150581978432
# Nine trailing constants, emitted as hexadecimal bit patterns.
.quad 0x42e8000000000000
.quad 0x3fac6af84b912394
.quad 0x3fcebfce50fac4f3
.quad 0x3fe62e42ff0c52d6
.quad 0x4338000000000000
.quad 0x40471547652b82fe
.quad 0x3ebc6af84b912394
.quad 0x3f2ebfce50fac4f3
.quad 0x3f962e42ff0c52d6
.size __exp2f_data, 328
# DWARF abbreviation table shared by all four compile units in .debug_info.
# Each declaration is: abbreviation code, tag, has-children flag, then
# (attribute, form) pairs terminated by a (0, 0) pair. Names in the comments
# are the DWARF constants for the numeric values below.
.section .debug_abbrev,"",@progbits
# Abbreviation code 1: DW_TAG_compile_unit (17), has children.
.byte 1
.byte 17
.byte 1
# DW_AT_producer (37), DW_FORM_strp (14).
.byte 37
.byte 14
# DW_AT_language (19), DW_FORM_data2 (5).
.byte 19
.byte 5
# DW_AT_name (3), DW_FORM_strp (14).
.byte 3
.byte 14
# DW_AT_stmt_list (16), DW_FORM_sec_offset (23).
.byte 16
.byte 23
# DW_AT_GNU_pubnames (0x2134 as the two-byte ULEB128 0xB4 0x42),
# DW_FORM_flag_present (25).
.ascii "\264B"
.byte 25
# DW_AT_low_pc (17), DW_FORM_addr (1).
.byte 17
.byte 1
# DW_AT_high_pc (18), DW_FORM_data4 (6).
.byte 18
.byte 6
# End of attribute list for code 1.
.byte 0
.byte 0
# Abbreviation code 2: DW_TAG_subprogram (46), no children.
.byte 2
.byte 46
.byte 0
# DW_AT_low_pc (17), DW_FORM_addr (1).
.byte 17
.byte 1
# DW_AT_high_pc (18), DW_FORM_data4 (6).
.byte 18
.byte 6
# DW_AT_frame_base (64), DW_FORM_exprloc (24).
.byte 64
.byte 24
# DW_AT_linkage_name (110), DW_FORM_strp (14).
.byte 110
.byte 14
# DW_AT_name (3), DW_FORM_strp (14).
.byte 3
.byte 14
# DW_AT_decl_file (58), DW_FORM_data1 (11).
.byte 58
.byte 11
# DW_AT_decl_line (59), DW_FORM_data1 (11).
.byte 59
.byte 11
# DW_AT_type (73), DW_FORM_ref4 (19): offset relative to the same unit.
.byte 73
.byte 19
# DW_AT_external (63), DW_FORM_flag_present (25).
.byte 63
.byte 25
# End of attribute list for code 2.
.byte 0
.byte 0
# Abbreviation code 3: DW_TAG_base_type (36), no children.
.byte 3
.byte 36
.byte 0
# DW_AT_name (3), DW_FORM_strp (14).
.byte 3
.byte 14
# DW_AT_encoding (62), DW_FORM_data1 (11).
.byte 62
.byte 11
# DW_AT_byte_size (11), DW_FORM_data1 (11).
.byte 11
.byte 11
# End of attribute list for code 3.
.byte 0
.byte 0
# Abbreviation code 4: DW_TAG_subprogram (46), no children. Identical to
# code 2 except that DW_AT_type uses DW_FORM_ref_addr, a section-relative
# reference that can point into another compile unit.
.byte 4
.byte 46
.byte 0
# DW_AT_low_pc (17), DW_FORM_addr (1).
.byte 17
.byte 1
# DW_AT_high_pc (18), DW_FORM_data4 (6).
.byte 18
.byte 6
# DW_AT_frame_base (64), DW_FORM_exprloc (24).
.byte 64
.byte 24
# DW_AT_linkage_name (110), DW_FORM_strp (14).
.byte 110
.byte 14
# DW_AT_name (3), DW_FORM_strp (14).
.byte 3
.byte 14
# DW_AT_decl_file (58), DW_FORM_data1 (11).
.byte 58
.byte 11
# DW_AT_decl_line (59), DW_FORM_data1 (11).
.byte 59
.byte 11
# DW_AT_type (73), DW_FORM_ref_addr (16).
.byte 73
.byte 16
# DW_AT_external (63), DW_FORM_flag_present (25).
.byte 63
.byte 25
# End of attribute list for code 4.
.byte 0
.byte 0
# End of the abbreviation table.
.byte 0
# DWARF .debug_info, compile unit 0: describes the dispatch_0_pack_f32
# function and defines the "int" base type that the other units refer to.
# Unit layout: the compile-unit DIE is at offset 11, the subprogram DIE at
# offset 38, the base-type DIE at offset 67, for a total unit size of 75
# bytes (the values used by .debug_pubnames / .debug_pubtypes).
.section .debug_info,"",@progbits
.Lcu_begin0:
# Unit header: length, DWARF version 4, abbreviation table offset, and an
# address size of 8 bytes.
.long .Ldebug_info_end0-.Ldebug_info_start0
.Ldebug_info_start0:
.short 4
.long .debug_abbrev
.byte 8
# Compile-unit DIE (abbreviation 1): producer string, language code 44,
# unit name, line table offset, low_pc and code size of the function.
.byte 1
.long .Linfo_string0
.short 44
.long .Linfo_string1
.long .Lline_table_start0
.quad .Lfunc_begin0
.long .Lfunc_end0-.Lfunc_begin0
# Subprogram DIE (abbreviation 2): address range of the function.
.byte 2
.quad .Lfunc_begin0
.long .Lfunc_end0-.Lfunc_begin0
# Frame base: a one-byte location expression, opcode 86 (DW_OP_reg6, which
# is rbp in the x86-64 DWARF register numbering).
.byte 1
.byte 86
# Linkage name and name (same string), declaration file 1, line 1.
.long .Linfo_string2
.long .Linfo_string2
.byte 1
.byte 1
# Return type: unit-relative offset 67, the base-type DIE below.
.long 67
# Base-type DIE (abbreviation 3): name "int", encoding 5 (DW_ATE_signed),
# size 4 bytes.
.byte 3
.long .Linfo_string3
.byte 5
.byte 4
# End of the compile-unit DIE's children.
.byte 0
.Ldebug_info_end0:
# DWARF .debug_info, compile unit 1: describes the
# dispatch_1_generic_Dx8640x3200_f16_pack function. The unit is 68 bytes
# long; its subprogram DIE is at unit offset 38.
.Lcu_begin1:
# Unit header: length, DWARF version 4, abbreviation table offset, and an
# address size of 8 bytes.
.long .Ldebug_info_end1-.Ldebug_info_start1
.Ldebug_info_start1:
.short 4
.long .debug_abbrev
.byte 8
# Compile-unit DIE (abbreviation 1): producer string, language code 44,
# unit name, line table offset, low_pc and code size of the function.
.byte 1
.long .Linfo_string0
.short 44
.long .Linfo_string1
.long .Lline_table_start0
.quad .Lfunc_begin1
.long .Lfunc_end1-.Lfunc_begin1
# Subprogram DIE (abbreviation 4): address range of the function.
.byte 4
.quad .Lfunc_begin1
.long .Lfunc_end1-.Lfunc_begin1
# Frame base: a one-byte location expression, opcode 86 (DW_OP_reg6, which
# is rbp in the x86-64 DWARF register numbering).
.byte 1
.byte 86
# Linkage name and name (same string), declaration file 1, line 1.
.long .Linfo_string4
.long .Linfo_string4
.byte 1
.byte 1
# Return type: section-relative reference to offset 67 of .debug_info,
# i.e. the "int" base type defined in compile unit 0.
.long .debug_info+67
# End of the compile-unit DIE's children.
.byte 0
.Ldebug_info_end1:
# DWARF .debug_info, compile unit 2: describes the
# dispatch_2_batch_mmt4d_DxDx540x3200x16x16x1_f32xf16xf32 function. The unit
# is 68 bytes long; its subprogram DIE is at unit offset 38.
.Lcu_begin2:
# Unit header: length, DWARF version 4, abbreviation table offset, and an
# address size of 8 bytes.
.long .Ldebug_info_end2-.Ldebug_info_start2
.Ldebug_info_start2:
.short 4
.long .debug_abbrev
.byte 8
# Compile-unit DIE (abbreviation 1): producer string, language code 44,
# unit name, line table offset, low_pc and code size of the function.
.byte 1
.long .Linfo_string0
.short 44
.long .Linfo_string1
.long .Lline_table_start0
.quad .Lfunc_begin2
.long .Lfunc_end2-.Lfunc_begin2
# Subprogram DIE (abbreviation 4): address range of the function.
.byte 4
.quad .Lfunc_begin2
.long .Lfunc_end2-.Lfunc_begin2
# Frame base: a one-byte location expression, opcode 86 (DW_OP_reg6, which
# is rbp in the x86-64 DWARF register numbering).
.byte 1
.byte 86
# Linkage name and name (same string), declaration file 1, line 1.
.long .Linfo_string5
.long .Linfo_string5
.byte 1
.byte 1
# Return type: section-relative reference to offset 67 of .debug_info,
# i.e. the "int" base type defined in compile unit 0.
.long .debug_info+67
# End of the compile-unit DIE's children.
.byte 0
.Ldebug_info_end2:
# DWARF .debug_info, compile unit 3: describes the dispatch_3_unpack_f32
# function. The unit is 68 bytes long; its subprogram DIE is at unit
# offset 38.
.Lcu_begin3:
# Unit header: length, DWARF version 4, abbreviation table offset, and an
# address size of 8 bytes.
.long .Ldebug_info_end3-.Ldebug_info_start3
.Ldebug_info_start3:
.short 4
.long .debug_abbrev
.byte 8
# Compile-unit DIE (abbreviation 1): producer string, language code 44,
# unit name, line table offset, low_pc and code size of the function.
.byte 1
.long .Linfo_string0
.short 44
.long .Linfo_string1
.long .Lline_table_start0
.quad .Lfunc_begin3
.long .Lfunc_end3-.Lfunc_begin3
# Subprogram DIE (abbreviation 4): address range of the function.
.byte 4
.quad .Lfunc_begin3
.long .Lfunc_end3-.Lfunc_begin3
# Frame base: a one-byte location expression, opcode 86 (DW_OP_reg6, which
# is rbp in the x86-64 DWARF register numbering).
.byte 1
.byte 86
# Linkage name and name (same string), declaration file 1, line 1.
.long .Linfo_string6
.long .Linfo_string6
.byte 1
.byte 1
# Return type: section-relative reference to offset 67 of .debug_info,
# i.e. the "int" base type defined in compile unit 0.
.long .debug_info+67
# End of the compile-unit DIE's children.
.byte 0
.Ldebug_info_end3:
# DWARF string table. The section flags "MS" with entity size 1 mark it as a
# mergeable section of NUL-terminated strings. The .Linfo_stringN labels are
# the targets of the string references in .debug_info.
.section .debug_str,"MS",@progbits,1
# Producer string, used by every compile unit.
.Linfo_string0:
.asciz "IREE"
# Compile-unit name, used by every compile unit.
.Linfo_string1:
.asciz "-"
# Name and linkage name of the function in compile unit 0.
.Linfo_string2:
.asciz "turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_0_pack_f32"
# Name of the base type defined in compile unit 0.
.Linfo_string3:
.asciz "int"
# Name and linkage name of the function in compile unit 1.
.Linfo_string4:
.asciz "turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack"
# Name and linkage name of the function in compile unit 2.
.Linfo_string5:
.asciz "turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_2_batch_mmt4d_DxDx540x3200x16x16x1_f32xf16xf32"
# Name and linkage name of the function in compile unit 3.
.Linfo_string6:
.asciz "turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_3_unpack_f32"
# Public name and public type lookup tables for compile unit 0. Each table
# is: length, version 2, offset of the unit in .debug_info, size of the unit
# (75 bytes), then (DIE offset, NUL-terminated name) entries ended by a zero
# offset.
.section .debug_pubnames,"",@progbits
.long .LpubNames_end0-.LpubNames_start0
.LpubNames_start0:
.short 2
.long .Lcu_begin0
.long 75
# Entry: the subprogram DIE at unit offset 38.
.long 38
.asciz "turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_0_pack_f32"
.long 0
.LpubNames_end0:
.section .debug_pubtypes,"",@progbits
.long .LpubTypes_end0-.LpubTypes_start0
.LpubTypes_start0:
.short 2
.long .Lcu_begin0
.long 75
# Entry: the base-type DIE at unit offset 67.
.long 67
.asciz "int"
.long 0
.LpubTypes_end0:
# Public name and public type lookup tables for compile unit 1. Each table
# is: length, version 2, offset of the unit in .debug_info, size of the unit
# (68 bytes), then (DIE offset, NUL-terminated name) entries ended by a zero
# offset. The type table for this unit has no entries.
.section .debug_pubnames,"",@progbits
.long .LpubNames_end1-.LpubNames_start1
.LpubNames_start1:
.short 2
.long .Lcu_begin1
.long 68
# Entry: the subprogram DIE at unit offset 38.
.long 38
.asciz "turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack"
.long 0
.LpubNames_end1:
.section .debug_pubtypes,"",@progbits
.long .LpubTypes_end1-.LpubTypes_start1
.LpubTypes_start1:
.short 2
.long .Lcu_begin1
.long 68
.long 0
.LpubTypes_end1:
# Public name and public type lookup tables for compile unit 2. Each table
# is: length, version 2, offset of the unit in .debug_info, size of the unit
# (68 bytes), then (DIE offset, NUL-terminated name) entries ended by a zero
# offset. The type table for this unit has no entries.
.section .debug_pubnames,"",@progbits
.long .LpubNames_end2-.LpubNames_start2
.LpubNames_start2:
.short 2
.long .Lcu_begin2
.long 68
# Entry: the subprogram DIE at unit offset 38.
.long 38
.asciz "turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_2_batch_mmt4d_DxDx540x3200x16x16x1_f32xf16xf32"
.long 0
.LpubNames_end2:
.section .debug_pubtypes,"",@progbits
.long .LpubTypes_end2-.LpubTypes_start2
.LpubTypes_start2:
.short 2
.long .Lcu_begin2
.long 68
.long 0
.LpubTypes_end2:
# Public name and public type lookup tables for compile unit 3. Each table
# is: length, version 2, offset of the unit in .debug_info, size of the unit
# (68 bytes), then (DIE offset, NUL-terminated name) entries ended by a zero
# offset. The type table for this unit has no entries.
.section .debug_pubnames,"",@progbits
.long .LpubNames_end3-.LpubNames_start3
.LpubNames_start3:
.short 2
.long .Lcu_begin3
.long 68
# Entry: the subprogram DIE at unit offset 38.
.long 38
.asciz "turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_3_unpack_f32"
.long 0
.LpubNames_end3:
.section .debug_pubtypes,"",@progbits
.long .LpubTypes_end3-.LpubTypes_start3
.LpubTypes_start3:
.short 2
.long .Lcu_begin3
.long 68
.long 0
.LpubTypes_end3:
# Empty .note.GNU-stack section. By GNU toolchain convention its presence
# (without the "x" flag) tells the linker that this object does not need an
# executable stack.
.section ".note.GNU-stack","",@progbits
# Start of the line-number table. .Lline_table_start0 is the target of the
# line table offset field in every compile unit of .debug_info. No line
# program bytes are written here.
# NOTE(review): presumably the assembler generates the table contents from
# the .loc directives in the code sections -- confirm.
.section .debug_line,"",@progbits
.Lline_table_start0:
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment