Skip to content

Instantly share code, notes, and snippets.

@AmosLewis
Created February 10, 2025 03:34
Show Gist options
  • Save AmosLewis/3860a0371236b528b24c86c48e1e31c2 to your computer and use it in GitHub Desktop.
Save AmosLewis/3860a0371236b528b24c86c48e1e31c2 to your computer and use it in GitHub Desktop.
(.venv) ➜ shark-ai git:(users/dan-garvey/enable_custom_fp8_matmul) ✗ python -m sharktank.examples.paged_llm_v1 --irpa-file=/home/chi/src/test/llama/dan/fp8.irpa --tokenizer-config-json=/home/chi/src/test/llama/dan/tokenizer.json --dump-bins "t"
/home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/turbine/aot/params.py:163: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:206.)
return torch.from_numpy(wrapper)
:: Prompting:
b't'
:: Prompt tokens: tensor([[83, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
:: Invoke prefill:
failed to translate executables
<unknown>:0: error: 'llvm.fpext' op operand #0 must be floating point LLVM type or LLVM dialect-compatible vector of floating point LLVM type, but got 'vector<16xi8>'
<unknown>:0: note: see current operation: %292 = "llvm.fpext"(%281) : (vector<16xi8>) -> vector<16xf32>
source.mlir:16:10: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,-amx-fp8,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,+avx512ifma,+xsave,+sse4.2,-tsxldtrk,-sm3,-ptwrite,-widekl,-movrs,+invpcid,+64bit,+xsavec,-avx10.1-512,+avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-ccmp,-amx-int8,-kl,-avx10.1-256,+evex512,-avxvnni,-rtm,+adx,+avx2,-hreset,-movdiri,-serialize,-sha512,+vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-avx10.2-256,+gfni,-avxvnniint16,-amx-fp16,-zu,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,+avx512bf16,+avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,+pku,-nf,-amx-tf32,-amx-avx512,+fsgsbase,+clzero,+mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,+wbnoinvd,-enqcmd,-amx-transpose,-avx10.2-512,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,+avx512bitalg,+rdpru,+clwb,+mmx,+sse2,+rdseed,+avx512vbmi2,-prefetchi,-amx-movrs,+rdpid,-fma4,+avx512vbmi,+shstk,+vaes,-waitpkg,-sgx,+fxsr,+avx512dq,+sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
%2 = linalg.batch_matmul_transpose_b ins(%arg0, %arg1 : tensor<1x32x4096xf8E4M3FNUZ>, tensor<1x4096x4096xf8E4M3FNUZ>) outs(%1 : tensor<1x32x4096xf32>) -> tensor<1x32x4096xf32>
^
source.mlir:2:3: note: called from
func.func @main(%arg0: tensor<1x32x4096xf8E4M3FNUZ>, %arg1: tensor<1x4096x4096xf8E4M3FNUZ>) -> tensor<1x32x4096xf32> {
^
source.mlir:16:10: note: see current operation:
"hal.executable.variant"() ({
"hal.executable.export"() ({
^bb0(%arg3: !hal.device):
%567 = "arith.constant"() <{value = 256 : index}> : () -> index
%568 = "arith.constant"() <{value = 2 : index}> : () -> index
%569 = "arith.constant"() <{value = 1 : index}> : () -> index
"hal.return"(%567, %568, %569) : (index, index, index) -> ()
}) {layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "main_dispatch_2_batch_mmt4d_1x2x256x4096x16x16x1_f8E4M3FNUZxf8E4M3FNUZxf32", workgroup_size = [1 : index, 1 : index, 1 : index]} : () -> ()
"builtin.module"() ({
"llvm.func"() <{CConv = #llvm.cconv<ccc>, arg_attrs = [{llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}], function_type = !llvm.func<i32 (ptr, ptr, ptr)>, linkage = #llvm.linkage<external>, sym_name = "main_dispatch_2_batch_mmt4d_1x2x256x4096x16x16x1_f8E4M3FNUZxf8E4M3FNUZxf32", visibility_ = 0 : i64}> ({
^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: !llvm.ptr):
%0 = "llvm.mlir.constant"() <{value = 15 : i64}> : () -> i64
%1 = "llvm.mlir.constant"() <{value = 14 : i64}> : () -> i64
%2 = "llvm.mlir.constant"() <{value = 13 : i64}> : () -> i64
%3 = "llvm.mlir.constant"() <{value = 12 : i64}> : () -> i64
%4 = "llvm.mlir.constant"() <{value = 11 : i64}> : () -> i64
%5 = "llvm.mlir.constant"() <{value = 10 : i64}> : () -> i64
%6 = "llvm.mlir.constant"() <{value = 9 : i64}> : () -> i64
%7 = "llvm.mlir.constant"() <{value = 8 : i64}> : () -> i64
%8 = "llvm.mlir.constant"() <{value = 7 : i64}> : () -> i64
%9 = "llvm.mlir.constant"() <{value = 6 : i64}> : () -> i64
%10 = "llvm.mlir.constant"() <{value = 5 : i64}> : () -> i64
%11 = "llvm.mlir.constant"() <{value = 4 : i64}> : () -> i64
%12 = "llvm.mlir.constant"() <{value = 3 : i64}> : () -> i64
%13 = "llvm.mlir.constant"() <{value = 2 : i64}> : () -> i64
%14 = "llvm.mlir.constant"() <{value = 1 : i64}> : () -> i64
%15 = "llvm.mlir.constant"() <{value = 0 : i32}> : () -> i32
%16 = "llvm.mlir.undef"() : () -> vector<16xf32>
%17 = "llvm.mlir.constant"() <{value = 0 : i64}> : () -> i64
%18 = "llvm.mlir.constant"() <{value = 256 : index}> : () -> i64
%19 = "llvm.mlir.constant"() <{value = 16777216 : index}> : () -> i64
%20 = "llvm.mlir.constant"() <{value = 64 : index}> : () -> i64
%21 = "llvm.mlir.constant"() <{value = true}> : () -> i1
%22 = "llvm.mlir.constant"() <{value = 65536 : index}> : () -> i64
%23 = "llvm.mlir.constant"() <{value = 15 : index}> : () -> i64
%24 = "llvm.mlir.constant"() <{value = 14 : index}> : () -> i64
%25 = "llvm.mlir.constant"() <{value = 13 : index}> : () -> i64
%26 = "llvm.mlir.constant"() <{value = 12 : index}> : () -> i64
%27 = "llvm.mlir.constant"() <{value = 11 : index}> : () -> i64
%28 = "llvm.mlir.constant"() <{value = 10 : index}> : () -> i64
%29 = "llvm.mlir.constant"() <{value = 9 : index}> : () -> i64
%30 = "llvm.mlir.constant"() <{value = 8 : index}> : () -> i64
%31 = "llvm.mlir.constant"() <{value = 7 : index}> : () -> i64
%32 = "llvm.mlir.constant"() <{value = 6 : index}> : () -> i64
%33 = "llvm.mlir.constant"() <{value = 5 : index}> : () -> i64
%34 = "llvm.mlir.constant"() <{value = 4 : index}> : () -> i64
%35 = "llvm.mlir.constant"() <{value = 3 : index}> : () -> i64
%36 = "llvm.mlir.constant"() <{value = 2 : index}> : () -> i64
%37 = "llvm.mlir.constant"() <{value = 16 : index}> : () -> i64
%38 = "llvm.mlir.constant"() <{value = dense<0.000000e+00> : vector<16x16xf32>}> : () -> !llvm.array<16 x vector<16xf32>>
%39 = "llvm.mlir.constant"() <{value = 1 : index}> : () -> i64
%40 = "llvm.mlir.constant"() <{value = 4096 : index}> : () -> i64
%41 = "llvm.mlir.constant"() <{value = 0.000000e+00 : f32}> : () -> f32
%42 = "llvm.mlir.constant"() <{value = 0 : index}> : () -> i64
%43 = "llvm.mlir.constant"() <{value = 131072 : index}> : () -> i64
%44 = "llvm.load"(%arg1) <{ordering = 0 : i64}> : (!llvm.ptr) -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
%45 = "llvm.extractvalue"(%44) <{position = array<i64: 10>}> : (!llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>) -> !llvm.ptr
%46 = "llvm.load"(%45) <{ordering = 0 : i64}> : (!llvm.ptr) -> !llvm.ptr
"llvm.intr.assume"(%21, %46, %20) <{op_bundle_sizes = array<i32: 2>, op_bundle_tags = ["align"]}> : (i1, !llvm.ptr, i64) -> ()
%47 = "llvm.load"(%arg1) <{ordering = 0 : i64}> : (!llvm.ptr) -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
%48 = "llvm.extractvalue"(%47) <{position = array<i64: 10>}> : (!llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>) -> !llvm.ptr
%49 = "llvm.load"(%48) <{ordering = 0 : i64}> : (!llvm.ptr) -> !llvm.ptr
%50 = "llvm.getelementptr"(%49) <{elem_type = i8, rawConstantIndices = array<i32: 131072>}> : (!llvm.ptr) -> !llvm.ptr
"llvm.intr.assume"(%21, %50, %20) <{op_bundle_sizes = array<i32: 2>, op_bundle_tags = ["align"]}> : (i1, !llvm.ptr, i64) -> ()
%51 = "llvm.load"(%arg1) <{ordering = 0 : i64}> : (!llvm.ptr) -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
%52 = "llvm.extractvalue"(%51) <{position = array<i64: 10>}> : (!llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>) -> !llvm.ptr
%53 = "llvm.getelementptr"(%52) <{elem_type = !llvm.ptr, rawConstantIndices = array<i32: 1>}> : (!llvm.ptr) -> !llvm.ptr
%54 = "llvm.load"(%53) <{ordering = 0 : i64}> : (!llvm.ptr) -> !llvm.ptr
%55 = "llvm.getelementptr"(%54) <{elem_type = f32, rawConstantIndices = array<i32: 4227072>}> : (!llvm.ptr) -> !llvm.ptr
"llvm.intr.assume"(%21, %55, %20) <{op_bundle_sizes = array<i32: 2>, op_bundle_tags = ["align"]}> : (i1, !llvm.ptr, i64) -> ()
%56 = "llvm.load"(%arg2) <{ordering = 0 : i64}> : (!llvm.ptr) -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)>
%57 = "llvm.extractvalue"(%56) <{position = array<i64: 0>}> : (!llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)>) -> i32
%58 = "llvm.zext"(%57) : (i32) -> i64
%59 = "llvm.load"(%arg2) <{ordering = 0 : i64}> : (!llvm.ptr) -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)>
%60 = "llvm.extractvalue"(%59) <{position = array<i64: 1>}> : (!llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)>) -> i32
%61 = "llvm.zext"(%60) : (i32) -> i64
"llvm.br"(%42)[^bb1] : (i64) -> ()
^bb1(%62: i64): // 2 preds: ^bb0, ^bb4
%63 = "llvm.icmp"(%62, %37) <{predicate = 2 : i64}> : (i64, i64) -> i1
"llvm.cond_br"(%63, %42)[^bb2, ^bb5] <{operandSegmentSizes = array<i32: 1, 1, 0>}> : (i1, i64) -> ()
^bb2(%64: i64): // 2 preds: ^bb1, ^bb3
%65 = "llvm.icmp"(%64, %37) <{predicate = 2 : i64}> : (i64, i64) -> i1
"llvm.cond_br"(%65)[^bb3, ^bb4] <{operandSegmentSizes = array<i32: 1, 0, 0>}> : (i1) -> ()
^bb3: // pred: ^bb2
%66 = "llvm.getelementptr"(%54) <{elem_type = f32, rawConstantIndices = array<i32: 4227072>}> : (!llvm.ptr) -> !llvm.ptr
%67 = "llvm.mul"(%42, %43) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%68 = "llvm.mul"(%61, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%69 = "llvm.add"(%67, %68) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%70 = "llvm.mul"(%58, %18) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%71 = "llvm.add"(%69, %70) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%72 = "llvm.mul"(%62, %37) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%73 = "llvm.add"(%71, %72) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%74 = "llvm.add"(%73, %64) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%75 = "llvm.getelementptr"(%66, %74) <{elem_type = f32, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
"llvm.store"(%41, %75) <{ordering = 0 : i64}> : (f32, !llvm.ptr) -> ()
%76 = "llvm.add"(%64, %39) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
"llvm.br"(%76)[^bb2] : (i64) -> ()
^bb4: // pred: ^bb2
%77 = "llvm.add"(%62, %39) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
"llvm.br"(%77)[^bb1] : (i64) -> ()
^bb5: // pred: ^bb1
%78 = "llvm.getelementptr"(%54) <{elem_type = f32, rawConstantIndices = array<i32: 4227072>}> : (!llvm.ptr) -> !llvm.ptr
%79 = "llvm.mul"(%42, %43) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%80 = "llvm.mul"(%61, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%81 = "llvm.add"(%79, %80) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%82 = "llvm.mul"(%58, %18) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%83 = "llvm.add"(%81, %82) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%84 = "llvm.mul"(%42, %37) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%85 = "llvm.add"(%83, %84) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%86 = "llvm.add"(%85, %42) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%87 = "llvm.getelementptr"(%78, %86) <{elem_type = f32, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%88 = "llvm.load"(%87) <{alignment = 4 : i64, ordering = 0 : i64}> : (!llvm.ptr) -> vector<16xf32>
%89 = "llvm.insertvalue"(%38, %88) <{position = array<i64: 0>}> : (!llvm.array<16 x vector<16xf32>>, vector<16xf32>) -> !llvm.array<16 x vector<16xf32>>
%90 = "llvm.getelementptr"(%54) <{elem_type = f32, rawConstantIndices = array<i32: 4227072>}> : (!llvm.ptr) -> !llvm.ptr
%91 = "llvm.mul"(%42, %43) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%92 = "llvm.mul"(%61, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%93 = "llvm.add"(%91, %92) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%94 = "llvm.mul"(%58, %18) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%95 = "llvm.add"(%93, %94) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%96 = "llvm.mul"(%39, %37) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%97 = "llvm.add"(%95, %96) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%98 = "llvm.add"(%97, %42) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%99 = "llvm.getelementptr"(%90, %98) <{elem_type = f32, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%100 = "llvm.load"(%99) <{alignment = 4 : i64, ordering = 0 : i64}> : (!llvm.ptr) -> vector<16xf32>
%101 = "llvm.insertvalue"(%89, %100) <{position = array<i64: 1>}> : (!llvm.array<16 x vector<16xf32>>, vector<16xf32>) -> !llvm.array<16 x vector<16xf32>>
%102 = "llvm.getelementptr"(%54) <{elem_type = f32, rawConstantIndices = array<i32: 4227072>}> : (!llvm.ptr) -> !llvm.ptr
%103 = "llvm.mul"(%42, %43) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%104 = "llvm.mul"(%61, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%105 = "llvm.add"(%103, %104) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%106 = "llvm.mul"(%58, %18) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%107 = "llvm.add"(%105, %106) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%108 = "llvm.mul"(%36, %37) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%109 = "llvm.add"(%107, %108) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%110 = "llvm.add"(%109, %42) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%111 = "llvm.getelementptr"(%102, %110) <{elem_type = f32, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%112 = "llvm.load"(%111) <{alignment = 4 : i64, ordering = 0 : i64}> : (!llvm.ptr) -> vector<16xf32>
%113 = "llvm.insertvalue"(%101, %112) <{position = array<i64: 2>}> : (!llvm.array<16 x vector<16xf32>>, vector<16xf32>) -> !llvm.array<16 x vector<16xf32>>
%114 = "llvm.getelementptr"(%54) <{elem_type = f32, rawConstantIndices = array<i32: 4227072>}> : (!llvm.ptr) -> !llvm.ptr
%115 = "llvm.mul"(%42, %43) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%116 = "llvm.mul"(%61, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%117 = "llvm.add"(%115, %116) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%118 = "llvm.mul"(%58, %18) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%119 = "llvm.add"(%117, %118) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%120 = "llvm.mul"(%35, %37) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%121 = "llvm.add"(%119, %120) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%122 = "llvm.add"(%121, %42) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%123 = "llvm.getelementptr"(%114, %122) <{elem_type = f32, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%124 = "llvm.load"(%123) <{alignment = 4 : i64, ordering = 0 : i64}> : (!llvm.ptr) -> vector<16xf32>
%125 = "llvm.insertvalue"(%113, %124) <{position = array<i64: 3>}> : (!llvm.array<16 x vector<16xf32>>, vector<16xf32>) -> !llvm.array<16 x vector<16xf32>>
%126 = "llvm.getelementptr"(%54) <{elem_type = f32, rawConstantIndices = array<i32: 4227072>}> : (!llvm.ptr) -> !llvm.ptr
%127 = "llvm.mul"(%42, %43) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%128 = "llvm.mul"(%61, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%129 = "llvm.add"(%127, %128) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%130 = "llvm.mul"(%58, %18) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%131 = "llvm.add"(%129, %130) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%132 = "llvm.mul"(%34, %37) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%133 = "llvm.add"(%131, %132) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%134 = "llvm.add"(%133, %42) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%135 = "llvm.getelementptr"(%126, %134) <{elem_type = f32, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%136 = "llvm.load"(%135) <{alignment = 4 : i64, ordering = 0 : i64}> : (!llvm.ptr) -> vector<16xf32>
%137 = "llvm.insertvalue"(%125, %136) <{position = array<i64: 4>}> : (!llvm.array<16 x vector<16xf32>>, vector<16xf32>) -> !llvm.array<16 x vector<16xf32>>
%138 = "llvm.getelementptr"(%54) <{elem_type = f32, rawConstantIndices = array<i32: 4227072>}> : (!llvm.ptr) -> !llvm.ptr
%139 = "llvm.mul"(%42, %43) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%140 = "llvm.mul"(%61, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%141 = "llvm.add"(%139, %140) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%142 = "llvm.mul"(%58, %18) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%143 = "llvm.add"(%141, %142) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%144 = "llvm.mul"(%33, %37) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%145 = "llvm.add"(%143, %144) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%146 = "llvm.add"(%145, %42) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%147 = "llvm.getelementptr"(%138, %146) <{elem_type = f32, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%148 = "llvm.load"(%147) <{alignment = 4 : i64, ordering = 0 : i64}> : (!llvm.ptr) -> vector<16xf32>
%149 = "llvm.insertvalue"(%137, %148) <{position = array<i64: 5>}> : (!llvm.array<16 x vector<16xf32>>, vector<16xf32>) -> !llvm.array<16 x vector<16xf32>>
%150 = "llvm.getelementptr"(%54) <{elem_type = f32, rawConstantIndices = array<i32: 4227072>}> : (!llvm.ptr) -> !llvm.ptr
%151 = "llvm.mul"(%42, %43) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%152 = "llvm.mul"(%61, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%153 = "llvm.add"(%151, %152) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%154 = "llvm.mul"(%58, %18) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%155 = "llvm.add"(%153, %154) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%156 = "llvm.mul"(%32, %37) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%157 = "llvm.add"(%155, %156) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%158 = "llvm.add"(%157, %42) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%159 = "llvm.getelementptr"(%150, %158) <{elem_type = f32, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%160 = "llvm.load"(%159) <{alignment = 4 : i64, ordering = 0 : i64}> : (!llvm.ptr) -> vector<16xf32>
%161 = "llvm.insertvalue"(%149, %160) <{position = array<i64: 6>}> : (!llvm.array<16 x vector<16xf32>>, vector<16xf32>) -> !llvm.array<16 x vector<16xf32>>
%162 = "llvm.getelementptr"(%54) <{elem_type = f32, rawConstantIndices = array<i32: 4227072>}> : (!llvm.ptr) -> !llvm.ptr
%163 = "llvm.mul"(%42, %43) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%164 = "llvm.mul"(%61, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%165 = "llvm.add"(%163, %164) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%166 = "llvm.mul"(%58, %18) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%167 = "llvm.add"(%165, %166) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%168 = "llvm.mul"(%31, %37) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%169 = "llvm.add"(%167, %168) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%170 = "llvm.add"(%169, %42) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%171 = "llvm.getelementptr"(%162, %170) <{elem_type = f32, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%172 = "llvm.load"(%171) <{alignment = 4 : i64, ordering = 0 : i64}> : (!llvm.ptr) -> vector<16xf32>
%173 = "llvm.insertvalue"(%161, %172) <{position = array<i64: 7>}> : (!llvm.array<16 x vector<16xf32>>, vector<16xf32>) -> !llvm.array<16 x vector<16xf32>>
%174 = "llvm.getelementptr"(%54) <{elem_type = f32, rawConstantIndices = array<i32: 4227072>}> : (!llvm.ptr) -> !llvm.ptr
%175 = "llvm.mul"(%42, %43) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%176 = "llvm.mul"(%61, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%177 = "llvm.add"(%175, %176) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%178 = "llvm.mul"(%58, %18) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%179 = "llvm.add"(%177, %178) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%180 = "llvm.mul"(%30, %37) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%181 = "llvm.add"(%179, %180) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%182 = "llvm.add"(%181, %42) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%183 = "llvm.getelementptr"(%174, %182) <{elem_type = f32, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%184 = "llvm.load"(%183) <{alignment = 4 : i64, ordering = 0 : i64}> : (!llvm.ptr) -> vector<16xf32>
%185 = "llvm.insertvalue"(%173, %184) <{position = array<i64: 8>}> : (!llvm.array<16 x vector<16xf32>>, vector<16xf32>) -> !llvm.array<16 x vector<16xf32>>
%186 = "llvm.getelementptr"(%54) <{elem_type = f32, rawConstantIndices = array<i32: 4227072>}> : (!llvm.ptr) -> !llvm.ptr
%187 = "llvm.mul"(%42, %43) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%188 = "llvm.mul"(%61, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%189 = "llvm.add"(%187, %188) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%190 = "llvm.mul"(%58, %18) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%191 = "llvm.add"(%189, %190) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%192 = "llvm.mul"(%29, %37) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%193 = "llvm.add"(%191, %192) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%194 = "llvm.add"(%193, %42) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%195 = "llvm.getelementptr"(%186, %194) <{elem_type = f32, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%196 = "llvm.load"(%195) <{alignment = 4 : i64, ordering = 0 : i64}> : (!llvm.ptr) -> vector<16xf32>
%197 = "llvm.insertvalue"(%185, %196) <{position = array<i64: 9>}> : (!llvm.array<16 x vector<16xf32>>, vector<16xf32>) -> !llvm.array<16 x vector<16xf32>>
%198 = "llvm.getelementptr"(%54) <{elem_type = f32, rawConstantIndices = array<i32: 4227072>}> : (!llvm.ptr) -> !llvm.ptr
%199 = "llvm.mul"(%42, %43) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%200 = "llvm.mul"(%61, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%201 = "llvm.add"(%199, %200) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%202 = "llvm.mul"(%58, %18) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%203 = "llvm.add"(%201, %202) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%204 = "llvm.mul"(%28, %37) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%205 = "llvm.add"(%203, %204) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%206 = "llvm.add"(%205, %42) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%207 = "llvm.getelementptr"(%198, %206) <{elem_type = f32, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%208 = "llvm.load"(%207) <{alignment = 4 : i64, ordering = 0 : i64}> : (!llvm.ptr) -> vector<16xf32>
%209 = "llvm.insertvalue"(%197, %208) <{position = array<i64: 10>}> : (!llvm.array<16 x vector<16xf32>>, vector<16xf32>) -> !llvm.array<16 x vector<16xf32>>
%210 = "llvm.getelementptr"(%54) <{elem_type = f32, rawConstantIndices = array<i32: 4227072>}> : (!llvm.ptr) -> !llvm.ptr
%211 = "llvm.mul"(%42, %43) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%212 = "llvm.mul"(%61, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%213 = "llvm.add"(%211, %212) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%214 = "llvm.mul"(%58, %18) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%215 = "llvm.add"(%213, %214) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%216 = "llvm.mul"(%27, %37) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%217 = "llvm.add"(%215, %216) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%218 = "llvm.add"(%217, %42) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%219 = "llvm.getelementptr"(%210, %218) <{elem_type = f32, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%220 = "llvm.load"(%219) <{alignment = 4 : i64, ordering = 0 : i64}> : (!llvm.ptr) -> vector<16xf32>
%221 = "llvm.insertvalue"(%209, %220) <{position = array<i64: 11>}> : (!llvm.array<16 x vector<16xf32>>, vector<16xf32>) -> !llvm.array<16 x vector<16xf32>>
%222 = "llvm.getelementptr"(%54) <{elem_type = f32, rawConstantIndices = array<i32: 4227072>}> : (!llvm.ptr) -> !llvm.ptr
%223 = "llvm.mul"(%42, %43) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%224 = "llvm.mul"(%61, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%225 = "llvm.add"(%223, %224) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%226 = "llvm.mul"(%58, %18) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%227 = "llvm.add"(%225, %226) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%228 = "llvm.mul"(%26, %37) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%229 = "llvm.add"(%227, %228) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%230 = "llvm.add"(%229, %42) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%231 = "llvm.getelementptr"(%222, %230) <{elem_type = f32, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%232 = "llvm.load"(%231) <{alignment = 4 : i64, ordering = 0 : i64}> : (!llvm.ptr) -> vector<16xf32>
%233 = "llvm.insertvalue"(%221, %232) <{position = array<i64: 12>}> : (!llvm.array<16 x vector<16xf32>>, vector<16xf32>) -> !llvm.array<16 x vector<16xf32>>
%234 = "llvm.getelementptr"(%54) <{elem_type = f32, rawConstantIndices = array<i32: 4227072>}> : (!llvm.ptr) -> !llvm.ptr
%235 = "llvm.mul"(%42, %43) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%236 = "llvm.mul"(%61, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%237 = "llvm.add"(%235, %236) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%238 = "llvm.mul"(%58, %18) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%239 = "llvm.add"(%237, %238) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%240 = "llvm.mul"(%25, %37) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%241 = "llvm.add"(%239, %240) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%242 = "llvm.add"(%241, %42) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%243 = "llvm.getelementptr"(%234, %242) <{elem_type = f32, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%244 = "llvm.load"(%243) <{alignment = 4 : i64, ordering = 0 : i64}> : (!llvm.ptr) -> vector<16xf32>
%245 = "llvm.insertvalue"(%233, %244) <{position = array<i64: 13>}> : (!llvm.array<16 x vector<16xf32>>, vector<16xf32>) -> !llvm.array<16 x vector<16xf32>>
%246 = "llvm.getelementptr"(%54) <{elem_type = f32, rawConstantIndices = array<i32: 4227072>}> : (!llvm.ptr) -> !llvm.ptr
%247 = "llvm.mul"(%42, %43) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%248 = "llvm.mul"(%61, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%249 = "llvm.add"(%247, %248) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%250 = "llvm.mul"(%58, %18) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%251 = "llvm.add"(%249, %250) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%252 = "llvm.mul"(%24, %37) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%253 = "llvm.add"(%251, %252) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%254 = "llvm.add"(%253, %42) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%255 = "llvm.getelementptr"(%246, %254) <{elem_type = f32, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%256 = "llvm.load"(%255) <{alignment = 4 : i64, ordering = 0 : i64}> : (!llvm.ptr) -> vector<16xf32>
%257 = "llvm.insertvalue"(%245, %256) <{position = array<i64: 14>}> : (!llvm.array<16 x vector<16xf32>>, vector<16xf32>) -> !llvm.array<16 x vector<16xf32>>
%258 = "llvm.getelementptr"(%54) <{elem_type = f32, rawConstantIndices = array<i32: 4227072>}> : (!llvm.ptr) -> !llvm.ptr
%259 = "llvm.mul"(%42, %43) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%260 = "llvm.mul"(%61, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%261 = "llvm.add"(%259, %260) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%262 = "llvm.mul"(%58, %18) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%263 = "llvm.add"(%261, %262) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%264 = "llvm.mul"(%23, %37) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%265 = "llvm.add"(%263, %264) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%266 = "llvm.add"(%265, %42) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%267 = "llvm.getelementptr"(%258, %266) <{elem_type = f32, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%268 = "llvm.load"(%267) <{alignment = 4 : i64, ordering = 0 : i64}> : (!llvm.ptr) -> vector<16xf32>
%269 = "llvm.insertvalue"(%257, %268) <{position = array<i64: 15>}> : (!llvm.array<16 x vector<16xf32>>, vector<16xf32>) -> !llvm.array<16 x vector<16xf32>>
"llvm.br"(%42, %269)[^bb6] : (i64, !llvm.array<16 x vector<16xf32>>) -> ()
^bb6(%270: i64, %271: !llvm.array<16 x vector<16xf32>>): // 2 preds: ^bb5, ^bb7
%272 = "llvm.icmp"(%270, %40) <{predicate = 2 : i64}> : (i64, i64) -> i1
"llvm.cond_br"(%272)[^bb7, ^bb8] <{operandSegmentSizes = array<i32: 1, 0, 0>}> : (i1) -> ()
^bb7: // pred: ^bb6
%273 = "llvm.mul"(%42, %43) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%274 = "llvm.mul"(%61, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%275 = "llvm.add"(%273, %274) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%276 = "llvm.mul"(%270, %37) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%277 = "llvm.add"(%275, %276) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%278 = "llvm.add"(%277, %42) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%279 = "llvm.add"(%278, %42) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%280 = "llvm.getelementptr"(%46, %279) <{elem_type = i8, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%281 = "llvm.load"(%280) <{alignment = 1 : i64, ordering = 0 : i64}> : (!llvm.ptr) -> vector<16xi8>
%282 = "llvm.getelementptr"(%49) <{elem_type = i8, rawConstantIndices = array<i32: 131072>}> : (!llvm.ptr) -> !llvm.ptr
%283 = "llvm.mul"(%42, %19) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%284 = "llvm.mul"(%58, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%285 = "llvm.add"(%283, %284) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%286 = "llvm.mul"(%270, %37) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%287 = "llvm.add"(%285, %286) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%288 = "llvm.add"(%287, %42) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%289 = "llvm.add"(%288, %42) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%290 = "llvm.getelementptr"(%282, %289) <{elem_type = i8, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%291 = "llvm.load"(%290) <{alignment = 1 : i64, ordering = 0 : i64}> : (!llvm.ptr) -> vector<16xi8>
%292 = "llvm.fpext"(%281) : (vector<16xi8>) -> vector<16xf32>
%293 = "llvm.fpext"(%291) : (vector<16xi8>) -> vector<16xf32>
%294 = "llvm.extractelement"(%292, %17) : (vector<16xf32>, i64) -> f32
%295 = "llvm.insertelement"(%16, %294, %15) : (vector<16xf32>, f32, i32) -> vector<16xf32>
%296 = "llvm.shufflevector"(%295, %16) <{mask = array<i32: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>}> : (vector<16xf32>, vector<16xf32>) -> vector<16xf32>
%297 = "llvm.extractvalue"(%271) <{position = array<i64: 0>}> : (!llvm.array<16 x vector<16xf32>>) -> vector<16xf32>
%298 = "llvm.intr.fmuladd"(%296, %293, %297) <{fastmathFlags = #llvm.fastmath<none>}> : (vector<16xf32>, vector<16xf32>, vector<16xf32>) -> vector<16xf32>
%299 = "llvm.insertvalue"(%38, %298) <{position = array<i64: 0>}> : (!llvm.array<16 x vector<16xf32>>, vector<16xf32>) -> !llvm.array<16 x vector<16xf32>>
%300 = "llvm.extractelement"(%292, %14) : (vector<16xf32>, i64) -> f32
%301 = "llvm.insertelement"(%16, %300, %15) : (vector<16xf32>, f32, i32) -> vector<16xf32>
%302 = "llvm.shufflevector"(%301, %16) <{mask = array<i32: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>}> : (vector<16xf32>, vector<16xf32>) -> vector<16xf32>
%303 = "llvm.extractvalue"(%271) <{position = array<i64: 1>}> : (!llvm.array<16 x vector<16xf32>>) -> vector<16xf32>
%304 = "llvm.intr.fmuladd"(%302, %293, %303) <{fastmathFlags = #llvm.fastmath<none>}> : (vector<16xf32>, vector<16xf32>, vector<16xf32>) -> vector<16xf32>
%305 = "llvm.insertvalue"(%299, %304) <{position = array<i64: 1>}> : (!llvm.array<16 x vector<16xf32>>, vector<16xf32>) -> !llvm.array<16 x vector<16xf32>>
%306 = "llvm.extractelement"(%292, %13) : (vector<16xf32>, i64) -> f32
%307 = "llvm.insertelement"(%16, %306, %15) : (vector<16xf32>, f32, i32) -> vector<16xf32>
%308 = "llvm.shufflevector"(%307, %16) <{mask = array<i32: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>}> : (vector<16xf32>, vector<16xf32>) -> vector<16xf32>
%309 = "llvm.extractvalue"(%271) <{position = array<i64: 2>}> : (!llvm.array<16 x vector<16xf32>>) -> vector<16xf32>
%310 = "llvm.intr.fmuladd"(%308, %293, %309) <{fastmathFlags = #llvm.fastmath<none>}> : (vector<16xf32>, vector<16xf32>, vector<16xf32>) -> vector<16xf32>
%311 = "llvm.insertvalue"(%305, %310) <{position = array<i64: 2>}> : (!llvm.array<16 x vector<16xf32>>, vector<16xf32>) -> !llvm.array<16 x vector<16xf32>>
%312 = "llvm.extractelement"(%292, %12) : (vector<16xf32>, i64) -> f32
%313 = "llvm.insertelement"(%16, %312, %15) : (vector<16xf32>, f32, i32) -> vector<16xf32>
%314 = "llvm.shufflevector"(%313, %16) <{mask = array<i32: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>}> : (vector<16xf32>, vector<16xf32>) -> vector<16xf32>
%315 = "llvm.extractvalue"(%271) <{position = array<i64: 3>}> : (!llvm.array<16 x vector<16xf32>>) -> vector<16xf32>
%316 = "llvm.intr.fmuladd"(%314, %293, %315) <{fastmathFlags = #llvm.fastmath<none>}> : (vector<16xf32>, vector<16xf32>, vector<16xf32>) -> vector<16xf32>
%317 = "llvm.insertvalue"(%311, %316) <{position = array<i64: 3>}> : (!llvm.array<16 x vector<16xf32>>, vector<16xf32>) -> !llvm.array<16 x vector<16xf32>>
%318 = "llvm.extractelement"(%292, %11) : (vector<16xf32>, i64) -> f32
%319 = "llvm.insertelement"(%16, %318, %15) : (vector<16xf32>, f32, i32) -> vector<16xf32>
%320 = "llvm.shufflevector"(%319, %16) <{mask = array<i32: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>}> : (vector<16xf32>, vector<16xf32>) -> vector<16xf32>
%321 = "llvm.extractvalue"(%271) <{position = array<i64: 4>}> : (!llvm.array<16 x vector<16xf32>>) -> vector<16xf32>
%322 = "llvm.intr.fmuladd"(%320, %293, %321) <{fastmathFlags = #llvm.fastmath<none>}> : (vector<16xf32>, vector<16xf32>, vector<16xf32>) -> vector<16xf32>
%323 = "llvm.insertvalue"(%317, %322) <{position = array<i64: 4>}> : (!llvm.array<16 x vector<16xf32>>, vector<16xf32>) -> !llvm.array<16 x vector<16xf32>>
%324 = "llvm.extractelement"(%292, %10) : (vector<16xf32>, i64) -> f32
%325 = "llvm.insertelement"(%16, %324, %15) : (vector<16xf32>, f32, i32) -> vector<16xf32>
%326 = "llvm.shufflevector"(%325, %16) <{mask = array<i32: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>}> : (vector<16xf32>, vector<16xf32>) -> vector<16xf32>
%327 = "llvm.extractvalue"(%271) <{position = array<i64: 5>}> : (!llvm.array<16 x vector<16xf32>>) -> vector<16xf32>
%328 = "llvm.intr.fmuladd"(%326, %293, %327) <{fastmathFlags = #llvm.fastmath<none>}> : (vector<16xf32>, vector<16xf32>, vector<16xf32>) -> vector<16xf32>
%329 = "llvm.insertvalue"(%323, %328) <{position = array<i64: 5>}> : (!llvm.array<16 x vector<16xf32>>, vector<16xf32>) -> !llvm.array<16 x vector<16xf32>>
%330 = "llvm.extractelement"(%292, %9) : (vector<16xf32>, i64) -> f32
%331 = "llvm.insertelement"(%16, %330, %15) : (vector<16xf32>, f32, i32) -> vector<16xf32>
%332 = "llvm.shufflevector"(%331, %16) <{mask = array<i32: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>}> : (vector<16xf32>, vector<16xf32>) -> vector<16xf32>
%333 = "llvm.extractvalue"(%271) <{position = array<i64: 6>}> : (!llvm.array<16 x vector<16xf32>>) -> vector<16xf32>
%334 = "llvm.intr.fmuladd"(%332, %293, %333) <{fastmathFlags = #llvm.fastmath<none>}> : (vector<16xf32>, vector<16xf32>, vector<16xf32>) -> vector<16xf32>
%335 = "llvm.insertvalue"(%329, %334) <{position = array<i64: 6>}> : (!llvm.array<16 x vector<16xf32>>, vector<16xf32>) -> !llvm.array<16 x vector<16xf32>>
%336 = "llvm.extractelement"(%292, %8) : (vector<16xf32>, i64) -> f32
%337 = "llvm.insertelement"(%16, %336, %15) : (vector<16xf32>, f32, i32) -> vector<16xf32>
%338 = "llvm.shufflevector"(%337, %16) <{mask = array<i32: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>}> : (vector<16xf32>, vector<16xf32>) -> vector<16xf32>
%339 = "llvm.extractvalue"(%271) <{position = array<i64: 7>}> : (!llvm.array<16 x vector<16xf32>>) -> vector<16xf32>
%340 = "llvm.intr.fmuladd"(%338, %293, %339) <{fastmathFlags = #llvm.fastmath<none>}> : (vector<16xf32>, vector<16xf32>, vector<16xf32>) -> vector<16xf32>
%341 = "llvm.insertvalue"(%335, %340) <{position = array<i64: 7>}> : (!llvm.array<16 x vector<16xf32>>, vector<16xf32>) -> !llvm.array<16 x vector<16xf32>>
%342 = "llvm.extractelement"(%292, %7) : (vector<16xf32>, i64) -> f32
%343 = "llvm.insertelement"(%16, %342, %15) : (vector<16xf32>, f32, i32) -> vector<16xf32>
%344 = "llvm.shufflevector"(%343, %16) <{mask = array<i32: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>}> : (vector<16xf32>, vector<16xf32>) -> vector<16xf32>
%345 = "llvm.extractvalue"(%271) <{position = array<i64: 8>}> : (!llvm.array<16 x vector<16xf32>>) -> vector<16xf32>
%346 = "llvm.intr.fmuladd"(%344, %293, %345) <{fastmathFlags = #llvm.fastmath<none>}> : (vector<16xf32>, vector<16xf32>, vector<16xf32>) -> vector<16xf32>
%347 = "llvm.insertvalue"(%341, %346) <{position = array<i64: 8>}> : (!llvm.array<16 x vector<16xf32>>, vector<16xf32>) -> !llvm.array<16 x vector<16xf32>>
%348 = "llvm.extractelement"(%292, %6) : (vector<16xf32>, i64) -> f32
%349 = "llvm.insertelement"(%16, %348, %15) : (vector<16xf32>, f32, i32) -> vector<16xf32>
%350 = "llvm.shufflevector"(%349, %16) <{mask = array<i32: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>}> : (vector<16xf32>, vector<16xf32>) -> vector<16xf32>
%351 = "llvm.extractvalue"(%271) <{position = array<i64: 9>}> : (!llvm.array<16 x vector<16xf32>>) -> vector<16xf32>
%352 = "llvm.intr.fmuladd"(%350, %293, %351) <{fastmathFlags = #llvm.fastmath<none>}> : (vector<16xf32>, vector<16xf32>, vector<16xf32>) -> vector<16xf32>
%353 = "llvm.insertvalue"(%347, %352) <{position = array<i64: 9>}> : (!llvm.array<16 x vector<16xf32>>, vector<16xf32>) -> !llvm.array<16 x vector<16xf32>>
%354 = "llvm.extractelement"(%292, %5) : (vector<16xf32>, i64) -> f32
%355 = "llvm.insertelement"(%16, %354, %15) : (vector<16xf32>, f32, i32) -> vector<16xf32>
%356 = "llvm.shufflevector"(%355, %16) <{mask = array<i32: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>}> : (vector<16xf32>, vector<16xf32>) -> vector<16xf32>
%357 = "llvm.extractvalue"(%271) <{position = array<i64: 10>}> : (!llvm.array<16 x vector<16xf32>>) -> vector<16xf32>
%358 = "llvm.intr.fmuladd"(%356, %293, %357) <{fastmathFlags = #llvm.fastmath<none>}> : (vector<16xf32>, vector<16xf32>, vector<16xf32>) -> vector<16xf32>
%359 = "llvm.insertvalue"(%353, %358) <{position = array<i64: 10>}> : (!llvm.array<16 x vector<16xf32>>, vector<16xf32>) -> !llvm.array<16 x vector<16xf32>>
%360 = "llvm.extractelement"(%292, %4) : (vector<16xf32>, i64) -> f32
%361 = "llvm.insertelement"(%16, %360, %15) : (vector<16xf32>, f32, i32) -> vector<16xf32>
%362 = "llvm.shufflevector"(%361, %16) <{mask = array<i32: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>}> : (vector<16xf32>, vector<16xf32>) -> vector<16xf32>
%363 = "llvm.extractvalue"(%271) <{position = array<i64: 11>}> : (!llvm.array<16 x vector<16xf32>>) -> vector<16xf32>
%364 = "llvm.intr.fmuladd"(%362, %293, %363) <{fastmathFlags = #llvm.fastmath<none>}> : (vector<16xf32>, vector<16xf32>, vector<16xf32>) -> vector<16xf32>
%365 = "llvm.insertvalue"(%359, %364) <{position = array<i64: 11>}> : (!llvm.array<16 x vector<16xf32>>, vector<16xf32>) -> !llvm.array<16 x vector<16xf32>>
%366 = "llvm.extractelement"(%292, %3) : (vector<16xf32>, i64) -> f32
%367 = "llvm.insertelement"(%16, %366, %15) : (vector<16xf32>, f32, i32) -> vector<16xf32>
%368 = "llvm.shufflevector"(%367, %16) <{mask = array<i32: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>}> : (vector<16xf32>, vector<16xf32>) -> vector<16xf32>
%369 = "llvm.extractvalue"(%271) <{position = array<i64: 12>}> : (!llvm.array<16 x vector<16xf32>>) -> vector<16xf32>
%370 = "llvm.intr.fmuladd"(%368, %293, %369) <{fastmathFlags = #llvm.fastmath<none>}> : (vector<16xf32>, vector<16xf32>, vector<16xf32>) -> vector<16xf32>
%371 = "llvm.insertvalue"(%365, %370) <{position = array<i64: 12>}> : (!llvm.array<16 x vector<16xf32>>, vector<16xf32>) -> !llvm.array<16 x vector<16xf32>>
%372 = "llvm.extractelement"(%292, %2) : (vector<16xf32>, i64) -> f32
%373 = "llvm.insertelement"(%16, %372, %15) : (vector<16xf32>, f32, i32) -> vector<16xf32>
%374 = "llvm.shufflevector"(%373, %16) <{mask = array<i32: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>}> : (vector<16xf32>, vector<16xf32>) -> vector<16xf32>
%375 = "llvm.extractvalue"(%271) <{position = array<i64: 13>}> : (!llvm.array<16 x vector<16xf32>>) -> vector<16xf32>
%376 = "llvm.intr.fmuladd"(%374, %293, %375) <{fastmathFlags = #llvm.fastmath<none>}> : (vector<16xf32>, vector<16xf32>, vector<16xf32>) -> vector<16xf32>
%377 = "llvm.insertvalue"(%371, %376) <{position = array<i64: 13>}> : (!llvm.array<16 x vector<16xf32>>, vector<16xf32>) -> !llvm.array<16 x vector<16xf32>>
%378 = "llvm.extractelement"(%292, %1) : (vector<16xf32>, i64) -> f32
%379 = "llvm.insertelement"(%16, %378, %15) : (vector<16xf32>, f32, i32) -> vector<16xf32>
%380 = "llvm.shufflevector"(%379, %16) <{mask = array<i32: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>}> : (vector<16xf32>, vector<16xf32>) -> vector<16xf32>
%381 = "llvm.extractvalue"(%271) <{position = array<i64: 14>}> : (!llvm.array<16 x vector<16xf32>>) -> vector<16xf32>
%382 = "llvm.intr.fmuladd"(%380, %293, %381) <{fastmathFlags = #llvm.fastmath<none>}> : (vector<16xf32>, vector<16xf32>, vector<16xf32>) -> vector<16xf32>
%383 = "llvm.insertvalue"(%377, %382) <{position = array<i64: 14>}> : (!llvm.array<16 x vector<16xf32>>, vector<16xf32>) -> !llvm.array<16 x vector<16xf32>>
%384 = "llvm.extractelement"(%292, %0) : (vector<16xf32>, i64) -> f32
%385 = "llvm.insertelement"(%16, %384, %15) : (vector<16xf32>, f32, i32) -> vector<16xf32>
%386 = "llvm.shufflevector"(%385, %16) <{mask = array<i32: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>}> : (vector<16xf32>, vector<16xf32>) -> vector<16xf32>
%387 = "llvm.extractvalue"(%271) <{position = array<i64: 15>}> : (!llvm.array<16 x vector<16xf32>>) -> vector<16xf32>
%388 = "llvm.intr.fmuladd"(%386, %293, %387) <{fastmathFlags = #llvm.fastmath<none>}> : (vector<16xf32>, vector<16xf32>, vector<16xf32>) -> vector<16xf32>
%389 = "llvm.insertvalue"(%383, %388) <{position = array<i64: 15>}> : (!llvm.array<16 x vector<16xf32>>, vector<16xf32>) -> !llvm.array<16 x vector<16xf32>>
%390 = "llvm.add"(%270, %39) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
"llvm.br"(%390, %389)[^bb6] : (i64, !llvm.array<16 x vector<16xf32>>) -> ()
^bb8: // pred: ^bb6
%391 = "llvm.extractvalue"(%271) <{position = array<i64: 0>}> : (!llvm.array<16 x vector<16xf32>>) -> vector<16xf32>
%392 = "llvm.getelementptr"(%54) <{elem_type = f32, rawConstantIndices = array<i32: 4227072>}> : (!llvm.ptr) -> !llvm.ptr
%393 = "llvm.mul"(%42, %43) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%394 = "llvm.mul"(%61, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%395 = "llvm.add"(%393, %394) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%396 = "llvm.mul"(%58, %18) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%397 = "llvm.add"(%395, %396) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%398 = "llvm.mul"(%42, %37) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%399 = "llvm.add"(%397, %398) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%400 = "llvm.add"(%399, %42) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%401 = "llvm.getelementptr"(%392, %400) <{elem_type = f32, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
"llvm.store"(%391, %401) <{alignment = 4 : i64, ordering = 0 : i64}> : (vector<16xf32>, !llvm.ptr) -> ()
%402 = "llvm.extractvalue"(%271) <{position = array<i64: 1>}> : (!llvm.array<16 x vector<16xf32>>) -> vector<16xf32>
%403 = "llvm.getelementptr"(%54) <{elem_type = f32, rawConstantIndices = array<i32: 4227072>}> : (!llvm.ptr) -> !llvm.ptr
%404 = "llvm.mul"(%42, %43) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%405 = "llvm.mul"(%61, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%406 = "llvm.add"(%404, %405) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%407 = "llvm.mul"(%58, %18) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%408 = "llvm.add"(%406, %407) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%409 = "llvm.mul"(%39, %37) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%410 = "llvm.add"(%408, %409) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%411 = "llvm.add"(%410, %42) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%412 = "llvm.getelementptr"(%403, %411) <{elem_type = f32, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
"llvm.store"(%402, %412) <{alignment = 4 : i64, ordering = 0 : i64}> : (vector<16xf32>, !llvm.ptr) -> ()
%413 = "llvm.extractvalue"(%271) <{position = array<i64: 2>}> : (!llvm.array<16 x vector<16xf32>>) -> vector<16xf32>
%414 = "llvm.getelementptr"(%54) <{elem_type = f32, rawConstantIndices = array<i32: 4227072>}> : (!llvm.ptr) -> !llvm.ptr
%415 = "llvm.mul"(%42, %43) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%416 = "llvm.mul"(%61, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%417 = "llvm.add"(%415, %416) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%418 = "llvm.mul"(%58, %18) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%419 = "llvm.add"(%417, %418) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%420 = "llvm.mul"(%36, %37) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%421 = "llvm.add"(%419, %420) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%422 = "llvm.add"(%421, %42) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%423 = "llvm.getelementptr"(%414, %422) <{elem_type = f32, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
"llvm.store"(%413, %423) <{alignment = 4 : i64, ordering = 0 : i64}> : (vector<16xf32>, !llvm.ptr) -> ()
%424 = "llvm.extractvalue"(%271) <{position = array<i64: 3>}> : (!llvm.array<16 x vector<16xf32>>) -> vector<16xf32>
%425 = "llvm.getelementptr"(%54) <{elem_type = f32, rawConstantIndices = array<i32: 4227072>}> : (!llvm.ptr) -> !llvm.ptr
%426 = "llvm.mul"(%42, %43) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%427 = "llvm.mul"(%61, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%428 = "llvm.add"(%426, %427) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%429 = "llvm.mul"(%58, %18) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%430 = "llvm.add"(%428, %429) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%431 = "llvm.mul"(%35, %37) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%432 = "llvm.add"(%430, %431) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%433 = "llvm.add"(%432, %42) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%434 = "llvm.getelementptr"(%425, %433) <{elem_type = f32, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
"llvm.store"(%424, %434) <{alignment = 4 : i64, ordering = 0 : i64}> : (vector<16xf32>, !llvm.ptr) -> ()
%435 = "llvm.extractvalue"(%271) <{position = array<i64: 4>}> : (!llvm.array<16 x vector<16xf32>>) -> vector<16xf32>
%436 = "llvm.getelementptr"(%54) <{elem_type = f32, rawConstantIndices = array<i32: 4227072>}> : (!llvm.ptr) -> !llvm.ptr
%437 = "llvm.mul"(%42, %43) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%438 = "llvm.mul"(%61, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%439 = "llvm.add"(%437, %438) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%440 = "llvm.mul"(%58, %18) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%441 = "llvm.add"(%439, %440) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%442 = "llvm.mul"(%34, %37) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%443 = "llvm.add"(%441, %442) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%444 = "llvm.add"(%443, %42) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%445 = "llvm.getelementptr"(%436, %444) <{elem_type = f32, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
"llvm.store"(%435, %445) <{alignment = 4 : i64, ordering = 0 : i64}> : (vector<16xf32>, !llvm.ptr) -> ()
%446 = "llvm.extractvalue"(%271) <{position = array<i64: 5>}> : (!llvm.array<16 x vector<16xf32>>) -> vector<16xf32>
%447 = "llvm.getelementptr"(%54) <{elem_type = f32, rawConstantIndices = array<i32: 4227072>}> : (!llvm.ptr) -> !llvm.ptr
%448 = "llvm.mul"(%42, %43) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%449 = "llvm.mul"(%61, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%450 = "llvm.add"(%448, %449) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%451 = "llvm.mul"(%58, %18) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%452 = "llvm.add"(%450, %451) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%453 = "llvm.mul"(%33, %37) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%454 = "llvm.add"(%452, %453) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%455 = "llvm.add"(%454, %42) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%456 = "llvm.getelementptr"(%447, %455) <{elem_type = f32, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
"llvm.store"(%446, %456) <{alignment = 4 : i64, ordering = 0 : i64}> : (vector<16xf32>, !llvm.ptr) -> ()
%457 = "llvm.extractvalue"(%271) <{position = array<i64: 6>}> : (!llvm.array<16 x vector<16xf32>>) -> vector<16xf32>
%458 = "llvm.getelementptr"(%54) <{elem_type = f32, rawConstantIndices = array<i32: 4227072>}> : (!llvm.ptr) -> !llvm.ptr
%459 = "llvm.mul"(%42, %43) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%460 = "llvm.mul"(%61, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%461 = "llvm.add"(%459, %460) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%462 = "llvm.mul"(%58, %18) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%463 = "llvm.add"(%461, %462) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%464 = "llvm.mul"(%32, %37) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%465 = "llvm.add"(%463, %464) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%466 = "llvm.add"(%465, %42) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%467 = "llvm.getelementptr"(%458, %466) <{elem_type = f32, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
"llvm.store"(%457, %467) <{alignment = 4 : i64, ordering = 0 : i64}> : (vector<16xf32>, !llvm.ptr) -> ()
%468 = "llvm.extractvalue"(%271) <{position = array<i64: 7>}> : (!llvm.array<16 x vector<16xf32>>) -> vector<16xf32>
%469 = "llvm.getelementptr"(%54) <{elem_type = f32, rawConstantIndices = array<i32: 4227072>}> : (!llvm.ptr) -> !llvm.ptr
%470 = "llvm.mul"(%42, %43) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%471 = "llvm.mul"(%61, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%472 = "llvm.add"(%470, %471) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%473 = "llvm.mul"(%58, %18) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%474 = "llvm.add"(%472, %473) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%475 = "llvm.mul"(%31, %37) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%476 = "llvm.add"(%474, %475) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%477 = "llvm.add"(%476, %42) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%478 = "llvm.getelementptr"(%469, %477) <{elem_type = f32, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
"llvm.store"(%468, %478) <{alignment = 4 : i64, ordering = 0 : i64}> : (vector<16xf32>, !llvm.ptr) -> ()
%479 = "llvm.extractvalue"(%271) <{position = array<i64: 8>}> : (!llvm.array<16 x vector<16xf32>>) -> vector<16xf32>
%480 = "llvm.getelementptr"(%54) <{elem_type = f32, rawConstantIndices = array<i32: 4227072>}> : (!llvm.ptr) -> !llvm.ptr
%481 = "llvm.mul"(%42, %43) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%482 = "llvm.mul"(%61, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%483 = "llvm.add"(%481, %482) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%484 = "llvm.mul"(%58, %18) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%485 = "llvm.add"(%483, %484) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%486 = "llvm.mul"(%30, %37) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%487 = "llvm.add"(%485, %486) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%488 = "llvm.add"(%487, %42) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%489 = "llvm.getelementptr"(%480, %488) <{elem_type = f32, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
"llvm.store"(%479, %489) <{alignment = 4 : i64, ordering = 0 : i64}> : (vector<16xf32>, !llvm.ptr) -> ()
%490 = "llvm.extractvalue"(%271) <{position = array<i64: 9>}> : (!llvm.array<16 x vector<16xf32>>) -> vector<16xf32>
%491 = "llvm.getelementptr"(%54) <{elem_type = f32, rawConstantIndices = array<i32: 4227072>}> : (!llvm.ptr) -> !llvm.ptr
%492 = "llvm.mul"(%42, %43) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%493 = "llvm.mul"(%61, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%494 = "llvm.add"(%492, %493) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%495 = "llvm.mul"(%58, %18) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%496 = "llvm.add"(%494, %495) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%497 = "llvm.mul"(%29, %37) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%498 = "llvm.add"(%496, %497) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%499 = "llvm.add"(%498, %42) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%500 = "llvm.getelementptr"(%491, %499) <{elem_type = f32, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
"llvm.store"(%490, %500) <{alignment = 4 : i64, ordering = 0 : i64}> : (vector<16xf32>, !llvm.ptr) -> ()
%501 = "llvm.extractvalue"(%271) <{position = array<i64: 10>}> : (!llvm.array<16 x vector<16xf32>>) -> vector<16xf32>
%502 = "llvm.getelementptr"(%54) <{elem_type = f32, rawConstantIndices = array<i32: 4227072>}> : (!llvm.ptr) -> !llvm.ptr
%503 = "llvm.mul"(%42, %43) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%504 = "llvm.mul"(%61, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%505 = "llvm.add"(%503, %504) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%506 = "llvm.mul"(%58, %18) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%507 = "llvm.add"(%505, %506) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%508 = "llvm.mul"(%28, %37) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%509 = "llvm.add"(%507, %508) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%510 = "llvm.add"(%509, %42) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%511 = "llvm.getelementptr"(%502, %510) <{elem_type = f32, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
"llvm.store"(%501, %511) <{alignment = 4 : i64, ordering = 0 : i64}> : (vector<16xf32>, !llvm.ptr) -> ()
%512 = "llvm.extractvalue"(%271) <{position = array<i64: 11>}> : (!llvm.array<16 x vector<16xf32>>) -> vector<16xf32>
%513 = "llvm.getelementptr"(%54) <{elem_type = f32, rawConstantIndices = array<i32: 4227072>}> : (!llvm.ptr) -> !llvm.ptr
%514 = "llvm.mul"(%42, %43) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%515 = "llvm.mul"(%61, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%516 = "llvm.add"(%514, %515) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%517 = "llvm.mul"(%58, %18) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%518 = "llvm.add"(%516, %517) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%519 = "llvm.mul"(%27, %37) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%520 = "llvm.add"(%518, %519) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%521 = "llvm.add"(%520, %42) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%522 = "llvm.getelementptr"(%513, %521) <{elem_type = f32, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
"llvm.store"(%512, %522) <{alignment = 4 : i64, ordering = 0 : i64}> : (vector<16xf32>, !llvm.ptr) -> ()
%523 = "llvm.extractvalue"(%271) <{position = array<i64: 12>}> : (!llvm.array<16 x vector<16xf32>>) -> vector<16xf32>
%524 = "llvm.getelementptr"(%54) <{elem_type = f32, rawConstantIndices = array<i32: 4227072>}> : (!llvm.ptr) -> !llvm.ptr
%525 = "llvm.mul"(%42, %43) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%526 = "llvm.mul"(%61, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%527 = "llvm.add"(%525, %526) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%528 = "llvm.mul"(%58, %18) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%529 = "llvm.add"(%527, %528) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%530 = "llvm.mul"(%26, %37) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%531 = "llvm.add"(%529, %530) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%532 = "llvm.add"(%531, %42) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%533 = "llvm.getelementptr"(%524, %532) <{elem_type = f32, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
"llvm.store"(%523, %533) <{alignment = 4 : i64, ordering = 0 : i64}> : (vector<16xf32>, !llvm.ptr) -> ()
%534 = "llvm.extractvalue"(%271) <{position = array<i64: 13>}> : (!llvm.array<16 x vector<16xf32>>) -> vector<16xf32>
%535 = "llvm.getelementptr"(%54) <{elem_type = f32, rawConstantIndices = array<i32: 4227072>}> : (!llvm.ptr) -> !llvm.ptr
%536 = "llvm.mul"(%42, %43) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%537 = "llvm.mul"(%61, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%538 = "llvm.add"(%536, %537) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%539 = "llvm.mul"(%58, %18) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%540 = "llvm.add"(%538, %539) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%541 = "llvm.mul"(%25, %37) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%542 = "llvm.add"(%540, %541) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%543 = "llvm.add"(%542, %42) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%544 = "llvm.getelementptr"(%535, %543) <{elem_type = f32, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
"llvm.store"(%534, %544) <{alignment = 4 : i64, ordering = 0 : i64}> : (vector<16xf32>, !llvm.ptr) -> ()
%545 = "llvm.extractvalue"(%271) <{position = array<i64: 14>}> : (!llvm.array<16 x vector<16xf32>>) -> vector<16xf32>
%546 = "llvm.getelementptr"(%54) <{elem_type = f32, rawConstantIndices = array<i32: 4227072>}> : (!llvm.ptr) -> !llvm.ptr
%547 = "llvm.mul"(%42, %43) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%548 = "llvm.mul"(%61, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%549 = "llvm.add"(%547, %548) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%550 = "llvm.mul"(%58, %18) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%551 = "llvm.add"(%549, %550) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%552 = "llvm.mul"(%24, %37) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%553 = "llvm.add"(%551, %552) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%554 = "llvm.add"(%553, %42) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%555 = "llvm.getelementptr"(%546, %554) <{elem_type = f32, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
"llvm.store"(%545, %555) <{alignment = 4 : i64, ordering = 0 : i64}> : (vector<16xf32>, !llvm.ptr) -> ()
%556 = "llvm.extractvalue"(%271) <{position = array<i64: 15>}> : (!llvm.array<16 x vector<16xf32>>) -> vector<16xf32>
%557 = "llvm.getelementptr"(%54) <{elem_type = f32, rawConstantIndices = array<i32: 4227072>}> : (!llvm.ptr) -> !llvm.ptr
%558 = "llvm.mul"(%42, %43) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%559 = "llvm.mul"(%61, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%560 = "llvm.add"(%558, %559) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%561 = "llvm.mul"(%58, %18) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%562 = "llvm.add"(%560, %561) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%563 = "llvm.mul"(%23, %37) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%564 = "llvm.add"(%562, %563) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%565 = "llvm.add"(%564, %42) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%566 = "llvm.getelementptr"(%557, %565) <{elem_type = f32, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
"llvm.store"(%556, %566) <{alignment = 4 : i64, ordering = 0 : i64}> : (vector<16xf32>, !llvm.ptr) -> ()
"llvm.return"(%15) : (i32) -> ()
}) : () -> ()
}) {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "embedded_elf_x86_64", target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,-amx-fp8,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,+avx512ifma,+xsave,+sse4.2,-tsxldtrk,-sm3,-ptwrite,-widekl,-movrs,+invpcid,+64bit,+xsavec,-avx10.1-512,+avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-ccmp,-amx-int8,-kl,-avx10.1-256,+evex512,-avxvnni,-rtm,+adx,+avx2,-hreset,-movdiri,-serialize,-sha512,+vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-avx10.2-256,+gfni,-avxvnniint16,-amx-fp16,-zu,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,+avx512bf16,+avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,+pku,-nf,-amx-tf32,-amx-avx512,+fsgsbase,+clzero,+mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,+wbnoinvd,-enqcmd,-amx-transpose,-avx10.2-512,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,+avx512bitalg,+rdpru,+clwb,+mmx,+sse2,+rdseed,+avx512vbmi2,-prefetchi,-amx-movrs,+rdpid,-fma4,+avx512vbmi,+shstk,+vaes,-waitpkg,-sgx,+fxsr,+avx512dq,+sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>} : () -> ()
%2 = linalg.batch_matmul_transpose_b ins(%arg0, %arg1 : tensor<1x32x4096xf8E4M3FNUZ>, tensor<1x4096x4096xf8E4M3FNUZ>) outs(%1 : tensor<1x32x4096xf32>) -> tensor<1x32x4096xf32>
^
Traceback (most recent call last):
File "<frozen runpy>", line 198, in _run_module_as_main
File "<frozen runpy>", line 88, in _run_code
File "/home/chi/src/shark-ai/sharktank/sharktank/examples/paged_llm_v1.py", line 342, in <module>
main()
File "/home/chi/src/shark-ai/sharktank/sharktank/examples/paged_llm_v1.py", line 321, in main
batch.prefill()
File "/home/chi/src/shark-ai/sharktank/sharktank/examples/paged_llm_v1.py", line 173, in prefill
logits = model.prefill(
^^^^^^^^^^^^^^
File "/home/chi/src/shark-ai/sharktank/sharktank/models/llama/llama.py", line 150, in prefill
h = block(
^^^^^^
File "/home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/chi/src/shark-ai/sharktank/sharktank/models/llama/llama.py", line 286, in forward
h = self.attn(
^^^^^^^^^^
File "/home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/chi/src/shark-ai/sharktank/sharktank/layers/paged_llama_attention_block.py", line 107, in forward
xq = self.attn_q(x)
^^^^^^^^^^^^^^
File "/home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/chi/src/shark-ai/sharktank/sharktank/layers/linear.py", line 80, in forward
y = ops.linear(x, weight, bias)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/chi/src/shark-ai/sharktank/sharktank/ops/_registry.py", line 199, in __call__
selected_override, *results = trampoline(self, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/chi/src/shark-ai/sharktank/sharktank/ops/signatures.py", line 663, in _linear_trampoline
result = override(input, weight, bias, accum_dtype=accum_dtype)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/chi/src/shark-ai/sharktank/sharktank/ops/qlinear_impls.py", line 95, in qlinear_tensor_scaled
y_qs = _invoke_mmt_kernel(x_qs, weight_qs, accum_dtype=accum_dtype)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/chi/src/shark-ai/sharktank/sharktank/ops/qlinear_impls.py", line 206, in _invoke_mmt_kernel
y_qs = kernels.batch_matmul_transpose_b(lhs, rhs, accum_dtype=accum_dtype)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/chi/src/shark-ai/sharktank/sharktank/kernels/batch_matmul_transpose_b.py", line 33, in batch_matmul_transpose_b
return _batch_matmul_transpose_b(
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/torch/_ops.py", line 1116, in __call__
return self._op(*args, **(kwargs or {}))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/turbine/runtime/op_reg/base.py", line 928, in handler
return eager_dispatch(ksel)
^^^^^^^^^^^^^^^^^^^^
File "/home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/turbine/runtime/op_reg/eager.py", line 98, in eager_dispatch
vm_context, vm_f, config = compile_standalone_kernel(device, ksel)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/turbine/runtime/op_reg/compiler.py", line 126, in compile_standalone_kernel
raise GeneralError(f"Kernel compilation failed. See diagnostics.")
iree.turbine.support.exceptions.GeneralError: Kernel compilation failed. See diagnostics.
@AmosLewis
Copy link
Author

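With Boian's new commit for eager mode (https://github.com/nod-ai/shark-ai/pull/896/files/9f1c3d40f2aa7a40abd4ab6f562648f7b35e53dc..aa5c7b099e0396607127d80622e1fab243a3b694), prefill now gets past the fp8 matmul kernel compilation and fails later, in the KV cache write, with a dtype mismatch: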

python -m sharktank.examples.paged_llm_v1 --irpa-file=/sharedfile/llama3_8b_fp8.irpa  \
--tokenizer-config-json=/home/chi/src/test/llama/dan/tokenizer.json --dump-bins "t"
/home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/turbine/aot/params.py:163: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:206.)
  return torch.from_numpy(wrapper)
:: Prompting:
    b't'
:: Prompt tokens: tensor([[83,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])
:: Invoke prefill:
/home/chi/src/shark-ai/sharktank/sharktank/ops/qlinear_impls.py:130: UserWarning: Tensor.T is deprecated on 0-D tensors. This function is the identity in these cases. (Triggered internally at ../aten/src/ATen/native/TensorShape.cpp:3691.)
  rescale_d = x_d * weight_d.T
Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/home/chi/src/shark-ai/sharktank/sharktank/examples/paged_llm_v1.py", line 342, in <module>
    main()
  File "/home/chi/src/shark-ai/sharktank/sharktank/examples/paged_llm_v1.py", line 321, in main
    batch.prefill()
  File "/home/chi/src/shark-ai/sharktank/sharktank/examples/paged_llm_v1.py", line 173, in prefill
    logits = model.prefill(
             ^^^^^^^^^^^^^^
  File "/home/chi/src/shark-ai/sharktank/sharktank/models/llama/llama.py", line 150, in prefill
    h = block(
        ^^^^^^
  File "/home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/chi/src/shark-ai/sharktank/sharktank/models/llama/llama.py", line 286, in forward
    h = self.attn(
        ^^^^^^^^^^
  File "/home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/chi/src/shark-ai/sharktank/sharktank/layers/paged_llama_attention_block.py", line 135, in forward
    xk, xv = self.transact_cache(
             ^^^^^^^^^^^^^^^^^^^^
  File "/home/chi/src/shark-ai/sharktank/sharktank/layers/paged_llama_attention_block.py", line 234, in transact_cache
    cache.write(
  File "/home/chi/src/shark-ai/sharktank/sharktank/layers/kv_cache.py", line 323, in write
    subblock_table.index_copy_(0, subblock_ids, part_block_view)
RuntimeError: index_copy_(): self and source expected to have the same dtype, but got (self) Half and (source) Float8_e4m3fnuz

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment