AmosLewis · February 14, 2025 02:27
diff --git a/llama_f8_attn_bug_log_0213.txt b/llama_f8_attn_bug_log_0213.txt
 /home/chi/src/iree-build/tools/iree-compile f8_attn_chi_castf32_roctorch.mlir \
  --iree-hip-target=gfx942 \
  -o=f8_attn_chi_castf32_roctorch_0213.vmfb \
  --iree-hal-target-device=hip \
  --iree-dispatch-creation-enable-aggressive-fusion=true \
  --iree-global-opt-propagate-transposes=true \
  --iree-opt-aggressively-propagate-transposes=true \
  --iree-opt-data-tiling=false \
  --iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
  --iree-hal-indirect-command-buffers=true \
  --iree-stream-resource-memory-model=discrete \
  --iree-hal-memoization=true \
  --iree-opt-strip-assertions
 failed to translate executables
 f8_attn_chi_castf32_roctorch.mlir:45778:10: error: 'func.func' op failed to distribute
    %1 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4, #map5]} ins(%collapsed, %collapsed_1, %collapsed_2, %extracted, %arg4 : tensor<32x?x128xf8E4M3FNUZ>, tensor<32x?x128xf8E4M3FNUZ>, tensor<32x?x128xf8E4M3FNUZ>, f32, tensor<?x?xf8E4M3FNUZ>) outs(%cast : tensor<32x?x128xf32>) {
         ^
 f8_attn_chi_castf32_roctorch.mlir:2706:12: note: called from
    %914 = util.call @sharktank_masked_flash_attention_1_32_128_128_f8E4M3FNUZ_f32_f32(%909, %910, %911, %913, %912) : (tensor<1x32x?x128xf8E4M3FNUZ>, tensor<1x32x?x128xf8E4M3FNUZ>, tensor<1x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?xf8E4M3FNUZ>) -> tensor<1x32x?x128xf32>
           ^
 f8_attn_chi_castf32_roctorch.mlir:45778:10: note: see current operation:
 "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_18_attention_8x4x1xDx32x128xf8E4M3FNUZ_generic"}> ({
  %0 = "arith.constant"() <{value = 7 : index}> : () -> index
  %1 = "arith.constant"() <{value = 6 : index}> : () -> index
  %2 = "arith.constant"() <{value = 5 : index}> : () -> index
  %3 = "arith.constant"() <{value = 4 : index}> : () -> index
  %4 = "arith.constant"() <{value = dense<0.000000e+00> : vector<1x8x1x1x8x1xf8E4M3FNUZ>}> : () -> vector<1x8x1x1x8x1xf8E4M3FNUZ>
  %5 = "arith.constant"() <{value = dense<0.000000e+00> : vector<2x1x1x1x1x8xf8E4M3FNUZ>}> : () -> vector<2x1x1x1x1x8xf8E4M3FNUZ>
  %6 = "arith.constant"() <{value = dense<0.000000e+00> : vector<8x2x1x1x1x4xf32>}> : () -> vector<8x2x1x1x1x4xf32>
  %7 = "arith.constant"() <{value = dense<0.000000e+00> : vector<2x2x1x1x1x4xf32>}> : () -> vector<2x2x1x1x1x4xf32>
  %8 = "arith.constant"() <{value = dense<0.000000e+00> : vector<8xf32>}> : () -> vector<8xf32>
  %9 = "arith.constant"() <{value = dense<0xFF800000> : vector<2x1x4xf32>}> : () -> vector<2x1x4xf32>
  %10 = "arith.constant"() <{value = dense<0.000000e+00> : vector<2x2x1x1x4x1xf8E4M3FNUZ>}> : () -> vector<2x2x1x1x4x1xf8E4M3FNUZ>
  %11 = "arith.constant"() <{value = dense<0.000000e+00> : vector<2x4x1x1x1x8xf8E4M3FNUZ>}> : () -> vector<2x4x1x1x1x8xf8E4M3FNUZ>
  %12 = "arith.constant"() <{value = 3 : index}> : () -> index
  %13 = "arith.constant"() <{value = 2 : index}> : () -> index
  %14 = "arith.constant"() <{value = dense<1.000000e+00> : vector<8x2x1x1x1x4xf32>}> : () -> vector<8x2x1x1x1x4xf32>
  %15 = "arith.constant"() <{value = dense<2.400000e+02> : vector<2x8x1x1x4x1xf32>}> : () -> vector<2x8x1x1x4x1xf32>
  %16 = "arith.constant"() <{value = dense<-2.400000e+02> : vector<2x8x1x1x4x1xf32>}> : () -> vector<2x8x1x1x4x1xf32>
  %17 = "arith.constant"() <{value = dense<2.400000e+02> : vector<2x2x1x1x4x1xf32>}> : () -> vector<2x2x1x1x4x1xf32>
  %18 = "arith.constant"() <{value = dense<1.44269502> : vector<2x2x1x1x4x1xf32>}> : () -> vector<2x2x1x1x4x1xf32>
  %19 = "arith.constant"() <{value = dense<0.00416666688> : vector<2x2x1x1x4x1xf32>}> : () -> vector<2x2x1x1x4x1xf32>
  %20 = "arith.constant"() <{value = dense<0xFF800000> : vector<32x32xf32>}> : () -> vector<32x32xf32>
  %21 = "arith.constant"() <{value = 0 : i64}> : () -> i64
  %22 = "arith.constant"() <{value = 0 : i8}> : () -> i8
  %23 = "arith.constant"() <{value = dense<0.000000e+00> : vector<32x32xf32>}> : () -> vector<32x32xf32>
  %24 = "arith.constant"() <{value = dense<0.000000e+00> : vector<2x2x1x1x4x1xf32>}> : () -> vector<2x2x1x1x4x1xf32>
  %25 = "arith.constant"() <{value = dense<0.000000e+00> : vector<2x1x4xf32>}> : () -> vector<2x1x4xf32>
  %26 = "arith.constant"() <{value = dense<-3.40282347E+38> : vector<2x1x4xf32>}> : () -> vector<2x1x4xf32>
  %27 = "arith.constant"() <{value = dense<0.000000e+00> : vector<2x8x1x1x4x1xf32>}> : () -> vector<2x8x1x1x4x1xf32>
  %28 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
  %29 = "arith.constant"() <{value = 1.44269502 : f32}> : () -> f32
  %30 = "arith.constant"() <{value = 1 : index}> : () -> index
  %31 = "arith.constant"() <{value = 32 : index}> : () -> index
  %32 = "arith.constant"() <{value = 67108864 : index}> : () -> index
  %33 = "arith.constant"() <{value = 32 : i64}> : () -> i64
  %34 = "arith.constant"() <{value = 0.000000e+00 : f32}> : () -> f32
  %35 = "arith.constant"() <{value = 0 : index}> : () -> index
  %36 = "gpu.thread_id"() <{dimension = #gpu<dim z>}> : () -> index
  %37 = "gpu.thread_id"() <{dimension = #gpu<dim y>}> : () -> index
  %38 = "gpu.thread_id"() <{dimension = #gpu<dim x>}> : () -> index
  %39 = "affine.linearize_index"(%36, %37, %38) <{disjoint, operandSegmentSizes = array<i32: 3, 0>, static_basis = array<i64: 1, 1, 64>}> : (index, index, index) -> index
  %40 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<32x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>
  %41 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
  %42 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
  %43 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
  %44 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x32x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>
  %45 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
  %46 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
  %47 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
  %48 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
  %49 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
  %50 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
  %51 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
  %52 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
  %53 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
  %54 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
  %55 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 10 : index} : () -> i32
  %56 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 11 : index} : () -> i32
  %57 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 12 : index} : () -> i32
  %58 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 13 : index} : () -> i32
  %59 = "arith.extui"(%45) : (i32) -> i64
  %60 = "arith.extui"(%46) : (i32) -> i64
  %61 = "arith.shli"(%60, %33) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %62 = "arith.ori"(%59, %61) : (i64, i64) -> i64
  %63 = "arith.index_castui"(%62) : (i64) -> index
  %64 = "arith.extui"(%47) : (i32) -> i64
  %65 = "arith.extui"(%48) : (i32) -> i64
  %66 = "arith.shli"(%65, %33) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %67 = "arith.ori"(%64, %66) : (i64, i64) -> i64
  %68 = "arith.index_castui"(%67) : (i64) -> index
  %69 = "arith.extui"(%49) : (i32) -> i64
  %70 = "arith.extui"(%50) : (i32) -> i64
  %71 = "arith.shli"(%70, %33) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %72 = "arith.ori"(%69, %71) : (i64, i64) -> i64
  %73 = "arith.index_castui"(%72) : (i64) -> index
  %74 = "arith.extui"(%51) : (i32) -> i64
  %75 = "arith.extui"(%52) : (i32) -> i64
  %76 = "arith.shli"(%75, %33) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %77 = "arith.ori"(%74, %76) : (i64, i64) -> i64
  %78 = "arith.index_castui"(%77) {stream.alignment = 64 : index, stream.values = [1075847616 : index, 1293968512 : index, 1512089408 : index, 1730210304 : index, 1948331200 : index, 2166452096 : index, 2384572992 : index, 2602693888 : index, 2820814784 : index, 3038935680 : index, 3257056576 : index, 3475177472 : index, 3693298368 : index, 3911419264 : index, 4129540160 : index, 4347661056 : index, 4565781952 : index, 4783902848 : index, 5002023744 : index, 5220144640 : index, 5438265536 : index, 5656386432 : index, 5874507328 : index, 6092628224 : index, 6310749120 : index, 6528870016 : index, 6746990912 : index, 6965111808 : index, 7183232704 : index, 7401353600 : index, 7619474496 : index, 7837595392 : index]} : (i64) -> index
  %79 = "arith.extui"(%53) : (i32) -> i64
  %80 = "arith.extui"(%54) : (i32) -> i64
  %81 = "arith.shli"(%80, %33) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %82 = "arith.ori"(%79, %81) : (i64, i64) -> i64
  %83 = "arith.index_castui"(%82) : (i64) -> index
  %84 = "arith.index_castui"(%55) : (i32) -> index
  %85 = "arith.bitcast"(%56) : (i32) -> f32
  %86 = "arith.index_castui"(%57) : (i32) -> index
  %87 = "arith.index_castui"(%58) : (i32) -> index
  %88:8 = "util.assume.int"(%63, %68, %73, %78, %83, %84, %86, %87) <{assumptions = [[#util.int.assumption<umin = 68027392, umax = 20995769344>], [#util.int.assumption<umin = 68158464, umax = 21532509184>], [#util.int.assumption<umin = 68355072, umax = 22337618944>], [#util.int.assumption<umin = 1075847616, umax = 1075847616, udiv = 1075847616>, #util.int.assumption<umin = 1293968512, umax = 1293968512, udiv = 1293968512>, #util.int.assumption<umin = 1512089408, umax = 1512089408, udiv = 1512089408>, #util.int.assumption<umin = 1730210304, umax = 1730210304, udiv = 1730210304>, #util.int.assumption<umin = 1948331200, umax = 1948331200, udiv = 1948331200>, #util.int.assumption<umin = 2166452096, umax = 2166452096, udiv = 2166452096>, #util.int.assumption<umin = 2384572992, umax = 2384572992, udiv = 2384572992>, #util.int.assumption<umin = 2602693888, umax = 2602693888, udiv = 2602693888>, #util.int.assumption<umin = 2820814784, umax = 2820814784, udiv = 2820814784>, #util.int.assumption<umin = 3038935680, umax = 3038935680, udiv = 3038935680>, #util.int.assumption<umin = 3257056576, umax = 3257056576, udiv = 3257056576>, #util.int.assumption<umin = 3475177472, umax = 3475177472, udiv = 3475177472>, #util.int.assumption<umin = 3693298368, umax = 3693298368, udiv = 3693298368>, #util.int.assumption<umin = 3911419264, umax = 3911419264, udiv = 3911419264>, #util.int.assumption<umin = 4129540160, umax = 4129540160, udiv = 4129540160>, #util.int.assumption<umin = 4347661056, umax = 4347661056, udiv = 4347661056>, #util.int.assumption<umin = 4565781952, umax = 4565781952, udiv = 4565781952>, #util.int.assumption<umin = 4783902848, umax = 4783902848, udiv = 4783902848>, #util.int.assumption<umin = 5002023744, umax = 5002023744, udiv = 5002023744>, #util.int.assumption<umin = 5220144640, umax = 5220144640, udiv = 5220144640>, #util.int.assumption<umin = 5438265536, umax = 5438265536, udiv = 5438265536>, #util.int.assumption<umin = 5656386432, umax = 5656386432, udiv = 5656386432>, #util.int.assumption<umin = 5874507328, umax = 5874507328, udiv = 5874507328>, #util.int.assumption<umin = 6092628224, umax = 6092628224, udiv = 6092628224>, #util.int.assumption<umin = 6310749120, umax = 6310749120, udiv = 6310749120>, #util.int.assumption<umin = 6528870016, umax = 6528870016, udiv = 6528870016>, #util.int.assumption<umin = 6746990912, umax = 6746990912, udiv = 6746990912>, #util.int.assumption<umin = 6965111808, umax = 6965111808, udiv = 6965111808>, #util.int.assumption<umin = 7183232704, umax = 7183232704, udiv = 7183232704>, #util.int.assumption<umin = 7401353600, umax = 7401353600, udiv = 7401353600>, #util.int.assumption<umin = 7619474496, umax = 7619474496, udiv = 7619474496>, #util.int.assumption<umin = 7837595392, umax = 7837595392, udiv = 7837595392>], [#util.int.assumption<umin = 67896320, umax = 20459029504>], [#util.int.assumption<umin = 32, umax = 131040, udiv = 32>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 32, umax = 131040, udiv = 32>]]}> : (index, index, index, index, index, index, index, index) -> (index, index, index, index, index, index, index, index)
  %89 = "hal.interface.binding.subspan"(%35) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 0>} : (index) -> memref<i64, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%89) <{alignment = 64 : i32}> : (memref<i64, #hal.descriptor_type<storage_buffer>>) -> ()
  %90 = "hal.interface.binding.subspan"(%88#3) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 1 : i32, layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 0>} : (index) -> memref<f32, strided<[], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%90) <{alignment = 64 : i32}> : (memref<f32, strided<[], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %91 = "flow.dispatch.workload.ordinal"(%88#5) <{ordinal = 0 : index}> : (index) -> index
  %92 = "flow.dispatch.workload.ordinal"(%88#6) <{ordinal = 1 : index}> : (index) -> index
  %93 = "flow.dispatch.workload.ordinal"(%88#6) <{ordinal = 2 : index}> : (index) -> index
  %94 = "flow.dispatch.workload.ordinal"(%88#7) <{ordinal = 3 : index}> : (index) -> index
  %95 = "hal.interface.binding.subspan"(%32, %92, %91) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 2>} : (index, index, index) -> memref<?x32x?xi8, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%95) <{alignment = 64 : i32}> : (memref<?x32x?xi8, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %96 = "hal.interface.binding.subspan"(%88#0, %93) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<8x4x1x?x32x128xf8E4M3FNUZ, strided<[?, ?, ?, 4096, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%96) <{alignment = 1 : i32}> : (memref<8x4x1x?x32x128xf8E4M3FNUZ, strided<[?, ?, ?, 4096, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %97 = "arith.divsi"(%94, %31) : (index, index) -> index
  %98 = "hal.interface.binding.subspan"(%88#1, %97) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<8x4x?x32x128xf8E4M3FNUZ, strided<[?, ?, 4096, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%98) <{alignment = 1 : i32}> : (memref<8x4x?x32x128xf8E4M3FNUZ, strided<[?, ?, 4096, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %99 = "arith.divsi"(%91, %31) : (index, index) -> index
  %100 = "hal.interface.binding.subspan"(%88#2, %99) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xf8E4M3FNUZ, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%100) <{alignment = 1 : i32}> : (memref<?x32x8x128xf8E4M3FNUZ, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %101 = "hal.interface.binding.subspan"(%88#4, %92) {alignment = 64 : index, binding = 3 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x4x128xf8E4M3FNUZ, strided<[?, 131072, 4096, 512, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%101) <{alignment = 1 : i32}> : (memref<1x?x32x8x4x128xf8E4M3FNUZ, strided<[?, 131072, 4096, 512, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  "scf.forall"(%93) <{mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0>, staticStep = array<i64: 1, 1, 1>, staticUpperBound = array<i64: 8, 4, -9223372036854775808>}> ({
  ^bb0(%arg0: index, %arg1: index, %arg2: index):
    "gpu.barrier"() : () -> ()
    %102 = "memref.subview"(%101, %arg2, %arg0, %arg1) <{operandSegmentSizes = array<i32: 1, 3, 0, 0>, static_offsets = array<i64: 0, -9223372036854775808, 0, -9223372036854775808, -9223372036854775808, 0>, static_sizes = array<i64: 1, 1, 32, 1, 1, 128>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x?x32x8x4x128xf8E4M3FNUZ, strided<[?, 131072, 4096, 512, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index) -> memref<1x1x32x1x1x128xf8E4M3FNUZ, strided<[?, 131072, 4096, 512, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %103 = "memref.subview"(%102) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0, 0, 0, 0, 0>, static_sizes = array<i64: 1, 1, 32, 1, 1, 128>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x32x1x1x128xf8E4M3FNUZ, strided<[?, 131072, 4096, 512, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> memref<32x128xf8E4M3FNUZ, strided<[4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %104:4 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 1, 1, 64>}> : (index) -> (index, index, index, index)
    %105:3 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 8, 8>}> : (index) -> (index, index, index)
    %106 = "affine.linearize_index"(%104#2, %35, %35, %105#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
    %107 = "affine.linearize_index"(%104#1, %35, %35, %105#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
    %108 = "vector.transfer_read"(%96, %arg0, %arg1, %35, %arg2, %106, %107, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 6, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d4, d5)>}> : (memref<8x4x1x?x32x128xf8E4M3FNUZ, strided<[?, ?, ?, 4096, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, index, f8E4M3FNUZ) -> vector<1x16xf8E4M3FNUZ>
    %109 = "affine.linearize_index"(%104#2, %30, %35, %105#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
    %110 = "affine.linearize_index"(%104#1, %35, %35, %105#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
    %111 = "vector.transfer_read"(%96, %arg0, %arg1, %35, %arg2, %109, %110, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 6, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d4, d5)>}> : (memref<8x4x1x?x32x128xf8E4M3FNUZ, strided<[?, ?, ?, 4096, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, index, f8E4M3FNUZ) -> vector<1x16xf8E4M3FNUZ>
    %112 = "affine.linearize_index"(%104#2, %13, %35, %105#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
    %113 = "affine.linearize_index"(%104#1, %35, %35, %105#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
    %114 = "vector.transfer_read"(%96, %arg0, %arg1, %35, %arg2, %112, %113, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 6, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d4, d5)>}> : (memref<8x4x1x?x32x128xf8E4M3FNUZ, strided<[?, ?, ?, 4096, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, index, f8E4M3FNUZ) -> vector<1x16xf8E4M3FNUZ>
    %115 = "affine.linearize_index"(%104#2, %12, %35, %105#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
    %116 = "affine.linearize_index"(%104#1, %35, %35, %105#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
    %117 = "vector.transfer_read"(%96, %arg0, %arg1, %35, %arg2, %115, %116, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 6, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d4, d5)>}> : (memref<8x4x1x?x32x128xf8E4M3FNUZ, strided<[?, ?, ?, 4096, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, index, f8E4M3FNUZ) -> vector<1x16xf8E4M3FNUZ>
    %118 = "arith.mulf"(%85, %29) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
    %119:4 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 1, 1, 64>}> : (index) -> (index, index, index, index)
    %120:3 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 8, 8>}> : (index) -> (index, index, index)
    %121 = "affine.linearize_index"(%119#2, %35, %35, %120#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
    %122 = "affine.linearize_index"(%119#1, %35, %35, %120#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
    "vector.transfer_write"(%108, %43, %121, %122) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<1x16xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index) -> ()
    %123 = "affine.linearize_index"(%119#2, %30, %35, %120#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
    %124 = "affine.linearize_index"(%119#1, %35, %35, %120#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
    "vector.transfer_write"(%111, %43, %123, %124) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<1x16xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index) -> ()
    %125 = "affine.linearize_index"(%119#2, %13, %35, %120#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
    %126 = "affine.linearize_index"(%119#1, %35, %35, %120#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
    "vector.transfer_write"(%114, %43, %125, %126) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<1x16xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index) -> ()
    %127 = "affine.linearize_index"(%119#2, %12, %35, %120#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
    %128 = "affine.linearize_index"(%119#1, %35, %35, %120#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
    "vector.transfer_write"(%117, %43, %127, %128) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<1x16xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index) -> ()
    "gpu.barrier"() : () -> ()
    %129:4 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 1, 1, 64>}> : (index) -> (index, index, index, index)
    %130:3 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index, index)
    %131 = "affine.linearize_index"(%129#2, %35, %35, %130#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
    %132 = "affine.linearize_index"(%129#1, %35, %35, %130#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 4, 8>}> : (index, index, index, index, index) -> index
    %133 = "vector.transfer_read"(%43, %131, %132, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<1x8xf8E4M3FNUZ>
    %134 = "vector.insert_strided_slice"(%133, %11) <{offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]}> : (vector<1x8xf8E4M3FNUZ>, vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<2x4x1x1x1x8xf8E4M3FNUZ>
    %135 = "affine.linearize_index"(%129#2, %35, %35, %130#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
    %136 = "affine.linearize_index"(%129#1, %30, %35, %130#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 4, 8>}> : (index, index, index, index, index) -> index
    %137 = "vector.transfer_read"(%43, %135, %136, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<1x8xf8E4M3FNUZ>
    %138 = "vector.insert_strided_slice"(%137, %134) <{offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]}> : (vector<1x8xf8E4M3FNUZ>, vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<2x4x1x1x1x8xf8E4M3FNUZ>
    %139 = "affine.linearize_index"(%129#2, %35, %35, %130#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
    %140 = "affine.linearize_index"(%129#1, %13, %35, %130#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 4, 8>}> : (index, index, index, index, index) -> index
    %141 = "vector.transfer_read"(%43, %139, %140, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<1x8xf8E4M3FNUZ>
    %142 = "vector.insert_strided_slice"(%141, %138) <{offsets = [0, 2, 0, 0, 0, 0], strides = [1, 1]}> : (vector<1x8xf8E4M3FNUZ>, vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<2x4x1x1x1x8xf8E4M3FNUZ>
    %143 = "affine.linearize_index"(%129#2, %35, %35, %130#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
    %144 = "affine.linearize_index"(%129#1, %12, %35, %130#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 4, 8>}> : (index, index, index, index, index) -> index
    %145 = "vector.transfer_read"(%43, %143, %144, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<1x8xf8E4M3FNUZ>
    %146 = "vector.insert_strided_slice"(%145, %142) <{offsets = [0, 3, 0, 0, 0, 0], strides = [1, 1]}> : (vector<1x8xf8E4M3FNUZ>, vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<2x4x1x1x1x8xf8E4M3FNUZ>
    %147 = "affine.linearize_index"(%129#2, %30, %35, %130#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
    %148 = "affine.linearize_index"(%129#1, %35, %35, %130#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 4, 8>}> : (index, index, index, index, index) -> index
    %149 = "vector.transfer_read"(%43, %147, %148, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<1x8xf8E4M3FNUZ>
    %150 = "vector.insert_strided_slice"(%149, %146) <{offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]}> : (vector<1x8xf8E4M3FNUZ>, vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<2x4x1x1x1x8xf8E4M3FNUZ>
    %151 = "affine.linearize_index"(%129#2, %30, %35, %130#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
    %152 = "affine.linearize_index"(%129#1, %30, %35, %130#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 4, 8>}> : (index, index, index, index, index) -> index
    %153 = "vector.transfer_read"(%43, %151, %152, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<1x8xf8E4M3FNUZ>
    %154 = "vector.insert_strided_slice"(%153, %150) <{offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]}> : (vector<1x8xf8E4M3FNUZ>, vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<2x4x1x1x1x8xf8E4M3FNUZ>
    %155 = "affine.linearize_index"(%129#2, %30, %35, %130#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
    %156 = "affine.linearize_index"(%129#1, %13, %35, %130#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 4, 8>}> : (index, index, index, index, index) -> index
    %157 = "vector.transfer_read"(%43, %155, %156, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<1x8xf8E4M3FNUZ>
    %158 = "vector.insert_strided_slice"(%157, %154) <{offsets = [1, 2, 0, 0, 0, 0], strides = [1, 1]}> : (vector<1x8xf8E4M3FNUZ>, vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<2x4x1x1x1x8xf8E4M3FNUZ>
    %159 = "affine.linearize_index"(%129#2, %30, %35, %130#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
    %160 = "affine.linearize_index"(%129#1, %12, %35, %130#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 4, 8>}> : (index, index, index, index, index) -> index
    %161 = "vector.transfer_read"(%43, %159, %160, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<1x8xf8E4M3FNUZ>
    %162 = "vector.insert_strided_slice"(%161, %158) <{offsets = [1, 3, 0, 0, 0, 0], strides = [1, 1]}> : (vector<1x8xf8E4M3FNUZ>, vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<2x4x1x1x1x8xf8E4M3FNUZ>
    %163 = "vector.transfer_read"(%89, %21) <{in_bounds = [], operandSegmentSizes = array<i32: 1, 0, 1, 0>, permutation_map = affine_map<() -> ()>}> : (memref<i64, #hal.descriptor_type<storage_buffer>>, i64) -> vector<i64>
    %164 = "iree_vector_ext.to_simd"(%163) : (vector<i64>) -> vector<i64>
    %165 = "vector.broadcast"(%164) : (vector<i64>) -> vector<32x32xi64>
    %166 = "vector.step"() : () -> vector<32xindex>
    %167 = "vector.broadcast"(%118) : (f32) -> vector<2x2x1x1x4x1xf32>
    %168:3 = "scf.for"(%35, %97, %30, %26, %25, %27) ({
    ^bb0(%arg3: index, %arg4: vector<2x1x4xf32>, %arg5: vector<2x1x4xf32>, %arg6: vector<2x8x1x1x4x1xf32>):
      "gpu.barrier"() : () -> ()
      %325 = "memref.subview"(%100, %arg3, %arg0) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0, -9223372036854775808, 0>, static_sizes = array<i64: 1, 32, 1, 128>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xf8E4M3FNUZ, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> memref<1x32x1x128xf8E4M3FNUZ, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %326 = "memref.subview"(%325) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0, 0, 0>, static_sizes = array<i64: 1, 32, 1, 128>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<1x32x1x128xf8E4M3FNUZ, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> memref<32x128xf8E4M3FNUZ, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %327:4 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 1, 1, 64>}> : (index) -> (index, index, index, index)
      %328:3 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 8, 8>}> : (index) -> (index, index, index)
      %329 = "affine.linearize_index"(%327#2, %35, %35, %328#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
      %330 = "affine.linearize_index"(%327#1, %35, %35, %328#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
      %331 = "vector.transfer_read"(%98, %arg0, %arg1, %arg3, %329, %330, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d3, d4)>}> : (memref<8x4x?x32x128xf8E4M3FNUZ, strided<[?, ?, 4096, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<1x16xf8E4M3FNUZ>
      %332 = "affine.linearize_index"(%327#2, %30, %35, %328#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
      %333 = "affine.linearize_index"(%327#1, %35, %35, %328#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
      %334 = "vector.transfer_read"(%98, %arg0, %arg1, %arg3, %332, %333, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d3, d4)>}> : (memref<8x4x?x32x128xf8E4M3FNUZ, strided<[?, ?, 4096, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<1x16xf8E4M3FNUZ>
      %335 = "affine.linearize_index"(%327#2, %13, %35, %328#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
      %336 = "affine.linearize_index"(%327#1, %35, %35, %328#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
      %337 = "vector.transfer_read"(%98, %arg0, %arg1, %arg3, %335, %336, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d3, d4)>}> : (memref<8x4x?x32x128xf8E4M3FNUZ, strided<[?, ?, 4096, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<1x16xf8E4M3FNUZ>
      %338 = "affine.linearize_index"(%327#2, %12, %35, %328#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
      %339 = "affine.linearize_index"(%327#1, %35, %35, %328#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
      %340 = "vector.transfer_read"(%98, %arg0, %arg1, %arg3, %338, %339, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d3, d4)>}> : (memref<8x4x?x32x128xf8E4M3FNUZ, strided<[?, ?, 4096, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<1x16xf8E4M3FNUZ>
      %341:4 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 1, 1, 64>}> : (index) -> (index, index, index, index)
      %342:3 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 8, 8>}> : (index) -> (index, index, index)
      %343 = "affine.linearize_index"(%341#2, %35, %35, %342#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
      %344 = "affine.linearize_index"(%341#1, %35, %35, %342#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
      %345 = "vector.transfer_read"(%326, %343, %344, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, f8E4M3FNUZ) -> vector<1x16xf8E4M3FNUZ>
      %346 = "affine.linearize_index"(%341#2, %30, %35, %342#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
      %347 = "affine.linearize_index"(%341#1, %35, %35, %342#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
      %348 = "vector.transfer_read"(%326, %346, %347, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, f8E4M3FNUZ) -> vector<1x16xf8E4M3FNUZ>
      %349 = "affine.linearize_index"(%341#2, %13, %35, %342#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
      %350 = "affine.linearize_index"(%341#1, %35, %35, %342#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
      %351 = "vector.transfer_read"(%326, %349, %350, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, f8E4M3FNUZ) -> vector<1x16xf8E4M3FNUZ>
      %352 = "affine.linearize_index"(%341#2, %12, %35, %342#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
      %353 = "affine.linearize_index"(%341#1, %35, %35, %342#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
      %354 = "vector.transfer_read"(%326, %352, %353, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, f8E4M3FNUZ) -> vector<1x16xf8E4M3FNUZ>
      %355 = "affine.linearize_index"(%arg3, %35, %99) <{disjoint, operandSegmentSizes = array<i32: 2, 1>, static_basis = array<i64: -9223372036854775808, 32>}> : (index, index, index) -> index
      %356 = "vector.transfer_read"(%95, %arg2, %35, %355, %22) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 3, 1, 0>, permutation_map = affine_map<(d0, d1, d2) -> (d1, d2)>}> : (memref<?x32x?xi8, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, i8) -> vector<32x32xi8>
      %357 = "arith.trunci"(%356) : (vector<32x32xi8>) -> vector<32x32xi1>
      %358 = "vector.broadcast"(%355) : (index) -> vector<32xindex>
      %359 = "arith.addi"(%358, %166) <{overflowFlags = #arith.overflow<none>}> : (vector<32xindex>, vector<32xindex>) -> vector<32xindex>
      %360 = "arith.index_cast"(%359) : (vector<32xindex>) -> vector<32xi64>
      %361 = "vector.broadcast"(%360) : (vector<32xi64>) -> vector<32x32xi64>
      %362 = "arith.cmpi"(%361, %165) <{predicate = 5 : i64}> : (vector<32x32xi64>, vector<32x32xi64>) -> vector<32x32xi1>
      %363 = "arith.ori"(%357, %362) : (vector<32x32xi1>, vector<32x32xi1>) -> vector<32x32xi1>
      %364 = "arith.select"(%363, %20, %23) : (vector<32x32xi1>, vector<32x32xf32>, vector<32x32xf32>) -> vector<32x32xf32>
      %365 = "arith.truncf"(%364) : (vector<32x32xf32>) -> vector<32x32xf8E4M3FNUZ>
      "vector.transfer_write"(%365, %44, %35, %35, %35) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 3, 0>, permutation_map = affine_map<(d0, d1, d2) -> (d1, d2)>}> : (vector<32x32xf8E4M3FNUZ>, memref<1x32x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, index) -> ()
      %366 = "memref.expand_shape"(%44) <{reassociation = [[0, 1], [2], [3, 4]], static_output_shape = array<i64: 1, 1, 32, 1, 32>}> : (memref<1x32x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>) -> memref<1x1x32x1x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>
      %367 = "memref.subview"(%366) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0, 0, 0, 0>, static_sizes = array<i64: 1, 1, 32, 1, 32>, static_strides = array<i64: 1, 1, 1, 1, 1>}> : (memref<1x1x32x1x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>) -> memref<32x32xf8E4M3FNUZ, strided<[32, 1]>, #gpu.address_space<workgroup>>
      %368:4 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 1, 1, 64>}> : (index) -> (index, index, index, index)
      %369:3 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 8, 8>}> : (index) -> (index, index, index)
      %370 = "affine.linearize_index"(%368#2, %35, %35, %369#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
      %371 = "affine.linearize_index"(%368#1, %35, %35, %369#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
      "vector.transfer_write"(%331, %42, %370, %371) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<1x16xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index) -> ()
      %372 = "affine.linearize_index"(%368#2, %30, %35, %369#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
      %373 = "affine.linearize_index"(%368#1, %35, %35, %369#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
      "vector.transfer_write"(%334, %42, %372, %373) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<1x16xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index) -> ()
      %374 = "affine.linearize_index"(%368#2, %13, %35, %369#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
      %375 = "affine.linearize_index"(%368#1, %35, %35, %369#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
      "vector.transfer_write"(%337, %42, %374, %375) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<1x16xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index) -> ()
      %376 = "affine.linearize_index"(%368#2, %12, %35, %369#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
      %377 = "affine.linearize_index"(%368#1, %35, %35, %369#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
      "vector.transfer_write"(%340, %42, %376, %377) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<1x16xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index) -> ()
      %378:4 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 1, 1, 64>}> : (index) -> (index, index, index, index)
      %379:3 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 8, 8>}> : (index) -> (index, index, index)
      %380 = "affine.linearize_index"(%378#2, %35, %35, %379#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
      %381 = "affine.linearize_index"(%378#1, %35, %35, %379#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
      "vector.transfer_write"(%345, %41, %380, %381) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<1x16xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index) -> ()
      %382 = "affine.linearize_index"(%378#2, %30, %35, %379#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
      %383 = "affine.linearize_index"(%378#1, %35, %35, %379#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
      "vector.transfer_write"(%348, %41, %382, %383) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<1x16xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index) -> ()
      %384 = "affine.linearize_index"(%378#2, %13, %35, %379#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
      %385 = "affine.linearize_index"(%378#1, %35, %35, %379#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
      "vector.transfer_write"(%351, %41, %384, %385) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<1x16xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index) -> ()
      %386 = "affine.linearize_index"(%378#2, %12, %35, %379#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
      %387 = "affine.linearize_index"(%378#1, %35, %35, %379#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
      "vector.transfer_write"(%354, %41, %386, %387) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<1x16xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index) -> ()
      %388:4 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 1, 1, 64>}> : (index) -> (index, index, index, index)
      %389:3 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index, index)
      %390 = "affine.linearize_index"(%388#2, %35, %35, %389#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
      %391 = "affine.linearize_index"(%388#1, %35, %35, %389#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
      %392 = "vector.transfer_read"(%367, %390, %391, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x32xf8E4M3FNUZ, strided<[32, 1]>, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<4x1xf8E4M3FNUZ>
      %393 = "vector.insert_strided_slice"(%392, %10) <{offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]}> : (vector<4x1xf8E4M3FNUZ>, vector<2x2x1x1x4x1xf8E4M3FNUZ>) -> vector<2x2x1x1x4x1xf8E4M3FNUZ>
      %394 = "affine.linearize_index"(%388#2, %35, %35, %389#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
      %395 = "affine.linearize_index"(%388#1, %30, %35, %389#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
      %396 = "vector.transfer_read"(%367, %394, %395, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x32xf8E4M3FNUZ, strided<[32, 1]>, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<4x1xf8E4M3FNUZ>
      %397 = "vector.insert_strided_slice"(%396, %393) <{offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]}> : (vector<4x1xf8E4M3FNUZ>, vector<2x2x1x1x4x1xf8E4M3FNUZ>) -> vector<2x2x1x1x4x1xf8E4M3FNUZ>
      %398 = "affine.linearize_index"(%388#2, %30, %35, %389#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
      %399 = "affine.linearize_index"(%388#1, %35, %35, %389#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
      %400 = "vector.transfer_read"(%367, %398, %399, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x32xf8E4M3FNUZ, strided<[32, 1]>, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<4x1xf8E4M3FNUZ>
      %401 = "vector.insert_strided_slice"(%400, %397) <{offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]}> : (vector<4x1xf8E4M3FNUZ>, vector<2x2x1x1x4x1xf8E4M3FNUZ>) -> vector<2x2x1x1x4x1xf8E4M3FNUZ>
      %402 = "affine.linearize_index"(%388#2, %30, %35, %389#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
      %403 = "affine.linearize_index"(%388#1, %30, %35, %389#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
      %404 = "vector.transfer_read"(%367, %402, %403, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x32xf8E4M3FNUZ, strided<[32, 1]>, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<4x1xf8E4M3FNUZ>
      %405 = "vector.insert_strided_slice"(%404, %401) <{offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]}> : (vector<4x1xf8E4M3FNUZ>, vector<2x2x1x1x4x1xf8E4M3FNUZ>) -> vector<2x2x1x1x4x1xf8E4M3FNUZ>
      %406 = "arith.extf"(%405) : (vector<2x2x1x1x4x1xf8E4M3FNUZ>) -> vector<2x2x1x1x4x1xf32>
      %407 = "arith.mulf"(%406, %18) <{fastmath = #arith.fastmath<none>}> : (vector<2x2x1x1x4x1xf32>, vector<2x2x1x1x4x1xf32>) -> vector<2x2x1x1x4x1xf32>
      "gpu.barrier"() : () -> ()
      %408:4 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 1, 1, 64>}> : (index) -> (index, index, index, index)
      %409:3 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index, index)
      %410 = "affine.linearize_index"(%408#2, %35, %35, %409#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
      %411 = "affine.linearize_index"(%408#1, %35, %35, %409#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 4, 8>}> : (index, index, index, index, index) -> index
      %412 = "vector.transfer_read"(%42, %410, %411, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<1x8xf8E4M3FNUZ>
      %413 = "vector.insert_strided_slice"(%412, %11) <{offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]}> : (vector<1x8xf8E4M3FNUZ>, vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<2x4x1x1x1x8xf8E4M3FNUZ>
      %414 = "affine.linearize_index"(%408#2, %35, %35, %409#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
      %415 = "affine.linearize_index"(%408#1, %30, %35, %409#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 4, 8>}> : (index, index, index, index, index) -> index
      %416 = "vector.transfer_read"(%42, %414, %415, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<1x8xf8E4M3FNUZ>
      %417 = "vector.insert_strided_slice"(%416, %413) <{offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]}> : (vector<1x8xf8E4M3FNUZ>, vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<2x4x1x1x1x8xf8E4M3FNUZ>
      %418 = "affine.linearize_index"(%408#2, %35, %35, %409#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
      %419 = "affine.linearize_index"(%408#1, %13, %35, %409#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 4, 8>}> : (index, index, index, index, index) -> index
      %420 = "vector.transfer_read"(%42, %418, %419, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<1x8xf8E4M3FNUZ>
      %421 = "vector.insert_strided_slice"(%420, %417) <{offsets = [0, 2, 0, 0, 0, 0], strides = [1, 1]}> : (vector<1x8xf8E4M3FNUZ>, vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<2x4x1x1x1x8xf8E4M3FNUZ>
      %422 = "affine.linearize_index"(%408#2, %35, %35, %409#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
      %423 = "affine.linearize_index"(%408#1, %12, %35, %409#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 4, 8>}> : (index, index, index, index, index) -> index
      %424 = "vector.transfer_read"(%42, %422, %423, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<1x8xf8E4M3FNUZ>
      %425 = "vector.insert_strided_slice"(%424, %421) <{offsets = [0, 3, 0, 0, 0, 0], strides = [1, 1]}> : (vector<1x8xf8E4M3FNUZ>, vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<2x4x1x1x1x8xf8E4M3FNUZ>
      %426 = "affine.linearize_index"(%408#2, %30, %35, %409#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
      %427 = "affine.linearize_index"(%408#1, %35, %35, %409#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 4, 8>}> : (index, index, index, index, index) -> index
      %428 = "vector.transfer_read"(%42, %426, %427, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<1x8xf8E4M3FNUZ>
      %429 = "vector.insert_strided_slice"(%428, %425) <{offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]}> : (vector<1x8xf8E4M3FNUZ>, vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<2x4x1x1x1x8xf8E4M3FNUZ>
      %430 = "affine.linearize_index"(%408#2, %30, %35, %409#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
      %431 = "affine.linearize_index"(%408#1, %30, %35, %409#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 4, 8>}> : (index, index, index, index, index) -> index
      %432 = "vector.transfer_read"(%42, %430, %431, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<1x8xf8E4M3FNUZ>
      %433 = "vector.insert_strided_slice"(%432, %429) <{offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]}> : (vector<1x8xf8E4M3FNUZ>, vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<2x4x1x1x1x8xf8E4M3FNUZ>
      %434 = "affine.linearize_index"(%408#2, %30, %35, %409#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
      %435 = "affine.linearize_index"(%408#1, %13, %35, %409#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 4, 8>}> : (index, index, index, index, index) -> index
      %436 = "vector.transfer_read"(%42, %434, %435, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<1x8xf8E4M3FNUZ>
      %437 = "vector.insert_strided_slice"(%436, %433) <{offsets = [1, 2, 0, 0, 0, 0], strides = [1, 1]}> : (vector<1x8xf8E4M3FNUZ>, vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<2x4x1x1x1x8xf8E4M3FNUZ>
      %438 = "affine.linearize_index"(%408#2, %30, %35, %409#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
      %439 = "affine.linearize_index"(%408#1, %12, %35, %409#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 4, 8>}> : (index, index, index, index, index) -> index
      %440 = "vector.transfer_read"(%42, %438, %439, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<1x8xf8E4M3FNUZ>
      %441 = "vector.insert_strided_slice"(%440, %437) <{offsets = [1, 3, 0, 0, 0, 0], strides = [1, 1]}> : (vector<1x8xf8E4M3FNUZ>, vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<2x4x1x1x1x8xf8E4M3FNUZ>
      %442 = "vector.extract"(%24) <{static_position = array<i64: 0, 0>}> : (vector<2x2x1x1x4x1xf32>) -> vector<1x1x4x1xf32>
      %443 = "vector.extract"(%162) <{static_position = array<i64: 0, 0>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %444 = "vector.extract"(%441) <{static_position = array<i64: 0, 0>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %445 = "vector.shape_cast"(%443) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %446 = "vector.shape_cast"(%444) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %447 = "vector.shape_cast"(%442) : (vector<1x1x4x1xf32>) -> vector<4xf32>
      %448 = "amdgpu.mfma"(%445, %446, %447) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
      %449 = "vector.extract"(%162) <{static_position = array<i64: 0, 1>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %450 = "vector.extract"(%441) <{static_position = array<i64: 0, 1>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %451 = "vector.shape_cast"(%449) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %452 = "vector.shape_cast"(%450) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %453 = "amdgpu.mfma"(%451, %452, %448) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
      %454 = "vector.extract"(%162) <{static_position = array<i64: 0, 2>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %455 = "vector.extract"(%441) <{static_position = array<i64: 0, 2>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %456 = "vector.shape_cast"(%454) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %457 = "vector.shape_cast"(%455) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %458 = "amdgpu.mfma"(%456, %457, %453) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
      %459 = "vector.extract"(%162) <{static_position = array<i64: 0, 3>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %460 = "vector.extract"(%441) <{static_position = array<i64: 0, 3>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %461 = "vector.shape_cast"(%459) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %462 = "vector.shape_cast"(%460) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %463 = "amdgpu.mfma"(%461, %462, %458) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
      %464 = "vector.shape_cast"(%463) : (vector<4xf32>) -> vector<1x1x4x1xf32>
      %465 = "vector.insert"(%464, %24) <{static_position = array<i64: 0, 0>}> : (vector<1x1x4x1xf32>, vector<2x2x1x1x4x1xf32>) -> vector<2x2x1x1x4x1xf32>
      %466 = "vector.extract"(%24) <{static_position = array<i64: 0, 1>}> : (vector<2x2x1x1x4x1xf32>) -> vector<1x1x4x1xf32>
      %467 = "vector.extract"(%162) <{static_position = array<i64: 0, 0>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %468 = "vector.extract"(%441) <{static_position = array<i64: 1, 0>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %469 = "vector.shape_cast"(%467) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %470 = "vector.shape_cast"(%468) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %471 = "vector.shape_cast"(%466) : (vector<1x1x4x1xf32>) -> vector<4xf32>
      %472 = "amdgpu.mfma"(%469, %470, %471) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
      %473 = "vector.extract"(%162) <{static_position = array<i64: 0, 1>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %474 = "vector.extract"(%441) <{static_position = array<i64: 1, 1>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %475 = "vector.shape_cast"(%473) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %476 = "vector.shape_cast"(%474) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %477 = "amdgpu.mfma"(%475, %476, %472) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
      %478 = "vector.extract"(%162) <{static_position = array<i64: 0, 2>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %479 = "vector.extract"(%441) <{static_position = array<i64: 1, 2>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %480 = "vector.shape_cast"(%478) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %481 = "vector.shape_cast"(%479) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %482 = "amdgpu.mfma"(%480, %481, %477) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
      %483 = "vector.extract"(%162) <{static_position = array<i64: 0, 3>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %484 = "vector.extract"(%441) <{static_position = array<i64: 1, 3>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %485 = "vector.shape_cast"(%483) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %486 = "vector.shape_cast"(%484) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %487 = "amdgpu.mfma"(%485, %486, %482) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
      %488 = "vector.shape_cast"(%487) : (vector<4xf32>) -> vector<1x1x4x1xf32>
      %489 = "vector.insert"(%488, %465) <{static_position = array<i64: 0, 1>}> : (vector<1x1x4x1xf32>, vector<2x2x1x1x4x1xf32>) -> vector<2x2x1x1x4x1xf32>
      %490 = "vector.extract"(%24) <{static_position = array<i64: 1, 0>}> : (vector<2x2x1x1x4x1xf32>) -> vector<1x1x4x1xf32>
      %491 = "vector.extract"(%162) <{static_position = array<i64: 1, 0>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %492 = "vector.extract"(%441) <{static_position = array<i64: 0, 0>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %493 = "vector.shape_cast"(%491) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %494 = "vector.shape_cast"(%492) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %495 = "vector.shape_cast"(%490) : (vector<1x1x4x1xf32>) -> vector<4xf32>
      %496 = "amdgpu.mfma"(%493, %494, %495) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
      %497 = "vector.extract"(%162) <{static_position = array<i64: 1, 1>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %498 = "vector.extract"(%441) <{static_position = array<i64: 0, 1>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %499 = "vector.shape_cast"(%497) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %500 = "vector.shape_cast"(%498) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %501 = "amdgpu.mfma"(%499, %500, %496) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
      %502 = "vector.extract"(%162) <{static_position = array<i64: 1, 2>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %503 = "vector.extract"(%441) <{static_position = array<i64: 0, 2>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %504 = "vector.shape_cast"(%502) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %505 = "vector.shape_cast"(%503) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %506 = "amdgpu.mfma"(%504, %505, %501) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
      %507 = "vector.extract"(%162) <{static_position = array<i64: 1, 3>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %508 = "vector.extract"(%441) <{static_position = array<i64: 0, 3>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %509 = "vector.shape_cast"(%507) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %510 = "vector.shape_cast"(%508) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %511 = "amdgpu.mfma"(%509, %510, %506) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
      %512 = "vector.shape_cast"(%511) : (vector<4xf32>) -> vector<1x1x4x1xf32>
      %513 = "vector.insert"(%512, %489) <{static_position = array<i64: 1, 0>}> : (vector<1x1x4x1xf32>, vector<2x2x1x1x4x1xf32>) -> vector<2x2x1x1x4x1xf32>
      %514 = "vector.extract"(%24) <{static_position = array<i64: 1, 1>}> : (vector<2x2x1x1x4x1xf32>) -> vector<1x1x4x1xf32>
      %515 = "vector.extract"(%162) <{static_position = array<i64: 1, 0>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %516 = "vector.extract"(%441) <{static_position = array<i64: 1, 0>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %517 = "vector.shape_cast"(%515) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %518 = "vector.shape_cast"(%516) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %519 = "vector.shape_cast"(%514) : (vector<1x1x4x1xf32>) -> vector<4xf32>
      %520 = "amdgpu.mfma"(%517, %518, %519) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
      %521 = "vector.extract"(%162) <{static_position = array<i64: 1, 1>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %522 = "vector.extract"(%441) <{static_position = array<i64: 1, 1>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %523 = "vector.shape_cast"(%521) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %524 = "vector.shape_cast"(%522) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %525 = "amdgpu.mfma"(%523, %524, %520) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
      %526 = "vector.extract"(%162) <{static_position = array<i64: 1, 2>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %527 = "vector.extract"(%441) <{static_position = array<i64: 1, 2>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %528 = "vector.shape_cast"(%526) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %529 = "vector.shape_cast"(%527) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %530 = "amdgpu.mfma"(%528, %529, %525) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
      %531 = "vector.extract"(%162) <{static_position = array<i64: 1, 3>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %532 = "vector.extract"(%441) <{static_position = array<i64: 1, 3>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %533 = "vector.shape_cast"(%531) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %534 = "vector.shape_cast"(%532) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %535 = "amdgpu.mfma"(%533, %534, %530) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
      %536 = "vector.shape_cast"(%535) : (vector<4xf32>) -> vector<1x1x4x1xf32>
      %537 = "vector.insert"(%536, %513) <{static_position = array<i64: 1, 1>}> : (vector<1x1x4x1xf32>, vector<2x2x1x1x4x1xf32>) -> vector<2x2x1x1x4x1xf32>
      %538 = "arith.mulf"(%167, %537) <{fastmath = #arith.fastmath<none>}> : (vector<2x2x1x1x4x1xf32>, vector<2x2x1x1x4x1xf32>) -> vector<2x2x1x1x4x1xf32>
      %539 = "arith.addf"(%538, %19) <{fastmath = #arith.fastmath<none>}> : (vector<2x2x1x1x4x1xf32>, vector<2x2x1x1x4x1xf32>) -> vector<2x2x1x1x4x1xf32>
      %540 = "arith.addf"(%539, %407) <{fastmath = #arith.fastmath<none>}> : (vector<2x2x1x1x4x1xf32>, vector<2x2x1x1x4x1xf32>) -> vector<2x2x1x1x4x1xf32>
      %541 = "vector.multi_reduction"(%540, %9) <{kind = #vector.kind<maximumf>, reduction_dims = array<i64: 1, 3, 5>}> : (vector<2x2x1x1x4x1xf32>, vector<2x1x4xf32>) -> vector<2x1x4xf32>
      %542 = "vector.extract"(%541) <{static_position = array<i64: 0, 0, 0>}> : (vector<2x1x4xf32>) -> f32
      %543 = "gpu.subgroup_reduce"(%542) <{cluster_size = 16 : i32, cluster_stride = 1 : i32, op = #gpu<all_reduce_op maximumf>}> : (f32) -> f32
      %544 = "vector.insert"(%543, %8) <{static_position = array<i64: 0>}> : (f32, vector<8xf32>) -> vector<8xf32>
      %545 = "vector.extract"(%541) <{static_position = array<i64: 0, 0, 1>}> : (vector<2x1x4xf32>) -> f32
      %546 = "gpu.subgroup_reduce"(%545) <{cluster_size = 16 : i32, cluster_stride = 1 : i32, op = #gpu<all_reduce_op maximumf>}> : (f32) -> f32
      %547 = "vector.insert"(%546, %544) <{static_position = array<i64: 1>}> : (f32, vector<8xf32>) -> vector<8xf32>
      %548 = "vector.extract"(%541) <{static_position = array<i64: 0, 0, 2>}> : (vector<2x1x4xf32>) -> f32
      %549 = "gpu.subgroup_reduce"(%548) <{cluster_size = 16 : i32, cluster_stride = 1 : i32, op = #gpu<all_reduce_op maximumf>}> : (f32) -> f32
      %550 = "vector.insert"(%549, %547) <{static_position = array<i64: 2>}> : (f32, vector<8xf32>) -> vector<8xf32>
      %551 = "vector.extract"(%541) <{static_position = array<i64: 0, 0, 3>}> : (vector<2x1x4xf32>) -> f32
      %552 = "gpu.subgroup_reduce"(%551) <{cluster_size = 16 : i32, cluster_stride = 1 : i32, op = #gpu<all_reduce_op maximumf>}> : (f32) -> f32
      %553 = "vector.insert"(%552, %550) <{static_position = array<i64: 3>}> : (f32, vector<8xf32>) -> vector<8xf32>
      %554 = "vector.extract"(%541) <{static_position = array<i64: 1, 0, 0>}> : (vector<2x1x4xf32>) -> f32
      %555 = "gpu.subgroup_reduce"(%554) <{cluster_size = 16 : i32, cluster_stride = 1 : i32, op = #gpu<all_reduce_op maximumf>}> : (f32) -> f32
      %556 = "vector.insert"(%555, %553) <{static_position = array<i64: 4>}> : (f32, vector<8xf32>) -> vector<8xf32>
      %557 = "vector.extract"(%541) <{static_position = array<i64: 1, 0, 1>}> : (vector<2x1x4xf32>) -> f32
      %558 = "gpu.subgroup_reduce"(%557) <{cluster_size = 16 : i32, cluster_stride = 1 : i32, op = #gpu<all_reduce_op maximumf>}> : (f32) -> f32
      %559 = "vector.insert"(%558, %556) <{static_position = array<i64: 5>}> : (f32, vector<8xf32>) -> vector<8xf32>
      %560 = "vector.extract"(%541) <{static_position = array<i64: 1, 0, 2>}> : (vector<2x1x4xf32>) -> f32
      %561 = "gpu.subgroup_reduce"(%560) <{cluster_size = 16 : i32, cluster_stride = 1 : i32, op = #gpu<all_reduce_op maximumf>}> : (f32) -> f32
      %562 = "vector.insert"(%561, %559) <{static_position = array<i64: 6>}> : (f32, vector<8xf32>) -> vector<8xf32>
      %563 = "vector.extract"(%541) <{static_position = array<i64: 1, 0, 3>}> : (vector<2x1x4xf32>) -> f32
      %564 = "gpu.subgroup_reduce"(%563) <{cluster_size = 16 : i32, cluster_stride = 1 : i32, op = #gpu<all_reduce_op maximumf>}> : (f32) -> f32
      %565 = "vector.insert"(%564, %562) <{static_position = array<i64: 7>}> : (f32, vector<8xf32>) -> vector<8xf32>
      %566 = "vector.shape_cast"(%565) : (vector<8xf32>) -> vector<2x1x4xf32>
      %567 = "arith.maximumf"(%566, %arg4) <{fastmath = #arith.fastmath<none>}> : (vector<2x1x4xf32>, vector<2x1x4xf32>) -> vector<2x1x4xf32>
      %568 = "arith.subf"(%arg4, %567) <{fastmath = #arith.fastmath<none>}> : (vector<2x1x4xf32>, vector<2x1x4xf32>) -> vector<2x1x4xf32>
      %569 = "math.exp2"(%568) <{fastmath = #arith.fastmath<none>}> : (vector<2x1x4xf32>) -> vector<2x1x4xf32>
      %570 = "arith.mulf"(%569, %arg5) <{fastmath = #arith.fastmath<none>}> : (vector<2x1x4xf32>, vector<2x1x4xf32>) -> vector<2x1x4xf32>
      %571 = "vector.extract"(%567) <{static_position = array<i64: 0, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
      %572 = "vector.broadcast"(%571) : (vector<4xf32>) -> vector<1x4xf32>
      %573 = "vector.insert"(%572, %7) <{static_position = array<i64: 0, 0, 0, 0>}> : (vector<1x4xf32>, vector<2x2x1x1x1x4xf32>) -> vector<2x2x1x1x1x4xf32>
      %574 = "vector.extract"(%567) <{static_position = array<i64: 1, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
      %575 = "vector.broadcast"(%574) : (vector<4xf32>) -> vector<1x4xf32>
      %576 = "vector.insert"(%575, %573) <{static_position = array<i64: 0, 1, 0, 0>}> : (vector<1x4xf32>, vector<2x2x1x1x1x4xf32>) -> vector<2x2x1x1x1x4xf32>
      %577 = "vector.extract"(%567) <{static_position = array<i64: 0, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
      %578 = "vector.broadcast"(%577) : (vector<4xf32>) -> vector<1x4xf32>
      %579 = "vector.insert"(%578, %576) <{static_position = array<i64: 1, 0, 0, 0>}> : (vector<1x4xf32>, vector<2x2x1x1x1x4xf32>) -> vector<2x2x1x1x1x4xf32>
      %580 = "vector.extract"(%567) <{static_position = array<i64: 1, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
      %581 = "vector.broadcast"(%580) : (vector<4xf32>) -> vector<1x4xf32>
      %582 = "vector.insert"(%581, %579) <{static_position = array<i64: 1, 1, 0, 0>}> : (vector<1x4xf32>, vector<2x2x1x1x1x4xf32>) -> vector<2x2x1x1x1x4xf32>
      %583 = "vector.transpose"(%582) <{permutation = array<i64: 1, 0, 3, 2, 5, 4>}> : (vector<2x2x1x1x1x4xf32>) -> vector<2x2x1x1x4x1xf32>
      %584 = "arith.subf"(%540, %583) <{fastmath = #arith.fastmath<none>}> : (vector<2x2x1x1x4x1xf32>, vector<2x2x1x1x4x1xf32>) -> vector<2x2x1x1x4x1xf32>
      %585 = "math.exp2"(%584) <{fastmath = #arith.fastmath<none>}> : (vector<2x2x1x1x4x1xf32>) -> vector<2x2x1x1x4x1xf32>
      %586 = "vector.multi_reduction"(%585, %25) <{kind = #vector.kind<add>, reduction_dims = array<i64: 1, 3, 5>}> : (vector<2x2x1x1x4x1xf32>, vector<2x1x4xf32>) -> vector<2x1x4xf32>
      %587 = "vector.extract"(%586) <{static_position = array<i64: 0, 0, 0>}> : (vector<2x1x4xf32>) -> f32
      %588 = "gpu.subgroup_reduce"(%587) <{cluster_size = 16 : i32, cluster_stride = 1 : i32, op = #gpu<all_reduce_op add>}> : (f32) -> f32
      %589 = "vector.insert"(%588, %8) <{static_position = array<i64: 0>}> : (f32, vector<8xf32>) -> vector<8xf32>
      %590 = "vector.extract"(%586) <{static_position = array<i64: 0, 0, 1>}> : (vector<2x1x4xf32>) -> f32
      %591 = "gpu.subgroup_reduce"(%590) <{cluster_size = 16 : i32, cluster_stride = 1 : i32, op = #gpu<all_reduce_op add>}> : (f32) -> f32
      %592 = "vector.insert"(%591, %589) <{static_position = array<i64: 1>}> : (f32, vector<8xf32>) -> vector<8xf32>
      %593 = "vector.extract"(%586) <{static_position = array<i64: 0, 0, 2>}> : (vector<2x1x4xf32>) -> f32
      %594 = "gpu.subgroup_reduce"(%593) <{cluster_size = 16 : i32, cluster_stride = 1 : i32, op = #gpu<all_reduce_op add>}> : (f32) -> f32
      %595 = "vector.insert"(%594, %592) <{static_position = array<i64: 2>}> : (f32, vector<8xf32>) -> vector<8xf32>
      %596 = "vector.extract"(%586) <{static_position = array<i64: 0, 0, 3>}> : (vector<2x1x4xf32>) -> f32
      %597 = "gpu.subgroup_reduce"(%596) <{cluster_size = 16 : i32, cluster_stride = 1 : i32, op = #gpu<all_reduce_op add>}> : (f32) -> f32
      %598 = "vector.insert"(%597, %595) <{static_position = array<i64: 3>}> : (f32, vector<8xf32>) -> vector<8xf32>
      %599 = "vector.extract"(%586) <{static_position = array<i64: 1, 0, 0>}> : (vector<2x1x4xf32>) -> f32
      %600 = "gpu.subgroup_reduce"(%599) <{cluster_size = 16 : i32, cluster_stride = 1 : i32, op = #gpu<all_reduce_op add>}> : (f32) -> f32
      %601 = "vector.insert"(%600, %598) <{static_position = array<i64: 4>}> : (f32, vector<8xf32>) -> vector<8xf32>
      %602 = "vector.extract"(%586) <{static_position = array<i64: 1, 0, 1>}> : (vector<2x1x4xf32>) -> f32
      %603 = "gpu.subgroup_reduce"(%602) <{cluster_size = 16 : i32, cluster_stride = 1 : i32, op = #gpu<all_reduce_op add>}> : (f32) -> f32
      %604 = "vector.insert"(%603, %601) <{static_position = array<i64: 5>}> : (f32, vector<8xf32>) -> vector<8xf32>
      %605 = "vector.extract"(%586) <{static_position = array<i64: 1, 0, 2>}> : (vector<2x1x4xf32>) -> f32
      %606 = "gpu.subgroup_reduce"(%605) <{cluster_size = 16 : i32, cluster_stride = 1 : i32, op = #gpu<all_reduce_op add>}> : (f32) -> f32
      %607 = "vector.insert"(%606, %604) <{static_position = array<i64: 6>}> : (f32, vector<8xf32>) -> vector<8xf32>
      %608 = "vector.extract"(%586) <{static_position = array<i64: 1, 0, 3>}> : (vector<2x1x4xf32>) -> f32
      %609 = "gpu.subgroup_reduce"(%608) <{cluster_size = 16 : i32, cluster_stride = 1 : i32, op = #gpu<all_reduce_op add>}> : (f32) -> f32
      %610 = "vector.insert"(%609, %607) <{static_position = array<i64: 7>}> : (f32, vector<8xf32>) -> vector<8xf32>
      %611 = "vector.shape_cast"(%610) : (vector<8xf32>) -> vector<2x1x4xf32>
      %612 = "arith.addf"(%611, %570) <{fastmath = #arith.fastmath<none>}> : (vector<2x1x4xf32>, vector<2x1x4xf32>) -> vector<2x1x4xf32>
      %613 = "arith.minimumf"(%585, %17) <{fastmath = #arith.fastmath<none>}> : (vector<2x2x1x1x4x1xf32>, vector<2x2x1x1x4x1xf32>) -> vector<2x2x1x1x4x1xf32>
      %614 = "arith.truncf"(%613) : (vector<2x2x1x1x4x1xf32>) -> vector<2x2x1x1x4x1xf8E4M3FNUZ>
      %615 = "vector.extract"(%569) <{static_position = array<i64: 0, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
      %616 = "vector.broadcast"(%615) : (vector<4xf32>) -> vector<1x4xf32>
      %617 = "vector.insert"(%616, %6) <{static_position = array<i64: 0, 0, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
      %618 = "vector.extract"(%569) <{static_position = array<i64: 1, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
      %619 = "vector.broadcast"(%618) : (vector<4xf32>) -> vector<1x4xf32>
      %620 = "vector.insert"(%619, %617) <{static_position = array<i64: 0, 1, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
      %621 = "vector.extract"(%569) <{static_position = array<i64: 0, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
      %622 = "vector.broadcast"(%621) : (vector<4xf32>) -> vector<1x4xf32>
      %623 = "vector.insert"(%622, %620) <{static_position = array<i64: 1, 0, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
      %624 = "vector.extract"(%569) <{static_position = array<i64: 1, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
      %625 = "vector.broadcast"(%624) : (vector<4xf32>) -> vector<1x4xf32>
      %626 = "vector.insert"(%625, %623) <{static_position = array<i64: 1, 1, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
      %627 = "vector.extract"(%569) <{static_position = array<i64: 0, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
      %628 = "vector.broadcast"(%627) : (vector<4xf32>) -> vector<1x4xf32>
      %629 = "vector.insert"(%628, %626) <{static_position = array<i64: 2, 0, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
      %630 = "vector.extract"(%569) <{static_position = array<i64: 1, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
      %631 = "vector.broadcast"(%630) : (vector<4xf32>) -> vector<1x4xf32>
      %632 = "vector.insert"(%631, %629) <{static_position = array<i64: 2, 1, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
      %633 = "vector.extract"(%569) <{static_position = array<i64: 0, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
      %634 = "vector.broadcast"(%633) : (vector<4xf32>) -> vector<1x4xf32>
      %635 = "vector.insert"(%634, %632) <{static_position = array<i64: 3, 0, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
      %636 = "vector.extract"(%569) <{static_position = array<i64: 1, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
      %637 = "vector.broadcast"(%636) : (vector<4xf32>) -> vector<1x4xf32>
      %638 = "vector.insert"(%637, %635) <{static_position = array<i64: 3, 1, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
      %639 = "vector.extract"(%569) <{static_position = array<i64: 0, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
      %640 = "vector.broadcast"(%639) : (vector<4xf32>) -> vector<1x4xf32>
      %641 = "vector.insert"(%640, %638) <{static_position = array<i64: 4, 0, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
      %642 = "vector.extract"(%569) <{static_position = array<i64: 1, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
      %643 = "vector.broadcast"(%642) : (vector<4xf32>) -> vector<1x4xf32>
      %644 = "vector.insert"(%643, %641) <{static_position = array<i64: 4, 1, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
      %645 = "vector.extract"(%569) <{static_position = array<i64: 0, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
      %646 = "vector.broadcast"(%645) : (vector<4xf32>) -> vector<1x4xf32>
      %647 = "vector.insert"(%646, %644) <{static_position = array<i64: 5, 0, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
      %648 = "vector.extract"(%569) <{static_position = array<i64: 1, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
      %649 = "vector.broadcast"(%648) : (vector<4xf32>) -> vector<1x4xf32>
      %650 = "vector.insert"(%649, %647) <{static_position = array<i64: 5, 1, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
      %651 = "vector.extract"(%569) <{static_position = array<i64: 0, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
      %652 = "vector.broadcast"(%651) : (vector<4xf32>) -> vector<1x4xf32>
      %653 = "vector.insert"(%652, %650) <{static_position = array<i64: 6, 0, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
      %654 = "vector.extract"(%569) <{static_position = array<i64: 1, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
      %655 = "vector.broadcast"(%654) : (vector<4xf32>) -> vector<1x4xf32>
      %656 = "vector.insert"(%655, %653) <{static_position = array<i64: 6, 1, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
      %657 = "vector.extract"(%569) <{static_position = array<i64: 0, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
      %658 = "vector.broadcast"(%657) : (vector<4xf32>) -> vector<1x4xf32>
      %659 = "vector.insert"(%658, %656) <{static_position = array<i64: 7, 0, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
      %660 = "vector.extract"(%569) <{static_position = array<i64: 1, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
      %661 = "vector.broadcast"(%660) : (vector<4xf32>) -> vector<1x4xf32>
      %662 = "vector.insert"(%661, %659) <{static_position = array<i64: 7, 1, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
      %663 = "vector.transpose"(%662) <{permutation = array<i64: 1, 0, 3, 2, 5, 4>}> : (vector<8x2x1x1x1x4xf32>) -> vector<2x8x1x1x4x1xf32>
      %664 = "arith.mulf"(%663, %arg6) <{fastmath = #arith.fastmath<none>}> : (vector<2x8x1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
      %665:4 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 1, 1, 64>}> : (index) -> (index, index, index, index)
      %666:3 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index, index)
      %667 = "affine.linearize_index"(%665#2, %35, %35, %666#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
      %668 = "affine.linearize_index"(%665#1, %35, %35, %666#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
      %669 = "vector.extract"(%614) <{static_position = array<i64: 0, 0, 0, 0>}> : (vector<2x2x1x1x4x1xf8E4M3FNUZ>) -> vector<4x1xf8E4M3FNUZ>
      "vector.transfer_write"(%669, %40, %667, %668) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<4x1xf8E4M3FNUZ>, memref<32x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index) -> ()
      %670 = "affine.linearize_index"(%665#2, %35, %35, %666#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
      %671 = "affine.linearize_index"(%665#1, %30, %35, %666#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
      %672 = "vector.extract"(%614) <{static_position = array<i64: 0, 1, 0, 0>}> : (vector<2x2x1x1x4x1xf8E4M3FNUZ>) -> vector<4x1xf8E4M3FNUZ>
      "vector.transfer_write"(%672, %40, %670, %671) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<4x1xf8E4M3FNUZ>, memref<32x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index) -> ()
      %673 = "affine.linearize_index"(%665#2, %30, %35, %666#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
      %674 = "affine.linearize_index"(%665#1, %35, %35, %666#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
      %675 = "vector.extract"(%614) <{static_position = array<i64: 1, 0, 0, 0>}> : (vector<2x2x1x1x4x1xf8E4M3FNUZ>) -> vector<4x1xf8E4M3FNUZ>
      "vector.transfer_write"(%675, %40, %673, %674) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<4x1xf8E4M3FNUZ>, memref<32x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index) -> ()
      %676 = "affine.linearize_index"(%665#2, %30, %35, %666#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
      %677 = "affine.linearize_index"(%665#1, %30, %35, %666#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
      %678 = "vector.extract"(%614) <{static_position = array<i64: 1, 1, 0, 0>}> : (vector<2x2x1x1x4x1xf8E4M3FNUZ>) -> vector<4x1xf8E4M3FNUZ>
      "vector.transfer_write"(%678, %40, %676, %677) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<4x1xf8E4M3FNUZ>, memref<32x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index) -> ()
      "gpu.barrier"() : () -> ()
      %679:4 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 1, 1, 64>}> : (index) -> (index, index, index, index)
      %680:3 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index, index)
      %681 = "affine.linearize_index"(%679#2, %35, %35, %680#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
      %682 = "affine.linearize_index"(%679#1, %35, %35, %680#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 4, 8>}> : (index, index, index, index, index) -> index
      %683 = "vector.transfer_read"(%40, %681, %682, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<1x8xf8E4M3FNUZ>
      %684 = "vector.insert_strided_slice"(%683, %5) <{offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]}> : (vector<1x8xf8E4M3FNUZ>, vector<2x1x1x1x1x8xf8E4M3FNUZ>) -> vector<2x1x1x1x1x8xf8E4M3FNUZ>
      %685 = "affine.linearize_index"(%679#2, %30, %35, %680#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
      %686 = "affine.linearize_index"(%679#1, %35, %35, %680#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 4, 8>}> : (index, index, index, index, index) -> index
      %687 = "vector.transfer_read"(%40, %685, %686, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<1x8xf8E4M3FNUZ>
      %688 = "vector.insert_strided_slice"(%687, %684) <{offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]}> : (vector<1x8xf8E4M3FNUZ>, vector<2x1x1x1x1x8xf8E4M3FNUZ>) -> vector<2x1x1x1x1x8xf8E4M3FNUZ>
      %689:4 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 1, 1, 64>}> : (index) -> (index, index, index, index)
      %690:3 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index, index)
      %691 = "affine.linearize_index"(%689#2, %35, %35, %690#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 4, 8>}> : (index, index, index, index, index) -> index
      %692 = "affine.linearize_index"(%689#1, %35, %35, %690#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
      %693 = "vector.transfer_read"(%41, %691, %692, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<8x1xf8E4M3FNUZ>
      %694 = "vector.insert_strided_slice"(%693, %4) <{offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]}> : (vector<8x1xf8E4M3FNUZ>, vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x8x1x1x8x1xf8E4M3FNUZ>
      %695 = "affine.linearize_index"(%689#2, %35, %35, %690#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 4, 8>}> : (index, index, index, index, index) -> index
      %696 = "affine.linearize_index"(%689#1, %30, %35, %690#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
      %697 = "vector.transfer_read"(%41, %695, %696, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<8x1xf8E4M3FNUZ>
      %698 = "vector.insert_strided_slice"(%697, %694) <{offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]}> : (vector<8x1xf8E4M3FNUZ>, vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x8x1x1x8x1xf8E4M3FNUZ>
      %699 = "affine.linearize_index"(%689#2, %35, %35, %690#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 4, 8>}> : (index, index, index, index, index) -> index
      %700 = "affine.linearize_index"(%689#1, %13, %35, %690#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
      %701 = "vector.transfer_read"(%41, %699, %700, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<8x1xf8E4M3FNUZ>
      %702 = "vector.insert_strided_slice"(%701, %698) <{offsets = [0, 2, 0, 0, 0, 0], strides = [1, 1]}> : (vector<8x1xf8E4M3FNUZ>, vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x8x1x1x8x1xf8E4M3FNUZ>
      %703 = "affine.linearize_index"(%689#2, %35, %35, %690#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 4, 8>}> : (index, index, index, index, index) -> index
      %704 = "affine.linearize_index"(%689#1, %12, %35, %690#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
      %705 = "vector.transfer_read"(%41, %703, %704, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<8x1xf8E4M3FNUZ>
      %706 = "vector.insert_strided_slice"(%705, %702) <{offsets = [0, 3, 0, 0, 0, 0], strides = [1, 1]}> : (vector<8x1xf8E4M3FNUZ>, vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x8x1x1x8x1xf8E4M3FNUZ>
      %707 = "affine.linearize_index"(%689#2, %35, %35, %690#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 4, 8>}> : (index, index, index, index, index) -> index
      %708 = "affine.linearize_index"(%689#1, %3, %35, %690#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
      %709 = "vector.transfer_read"(%41, %707, %708, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<8x1xf8E4M3FNUZ>
      %710 = "vector.insert_strided_slice"(%709, %706) <{offsets = [0, 4, 0, 0, 0, 0], strides = [1, 1]}> : (vector<8x1xf8E4M3FNUZ>, vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x8x1x1x8x1xf8E4M3FNUZ>
      %711 = "affine.linearize_index"(%689#2, %35, %35, %690#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 4, 8>}> : (index, index, index, index, index) -> index
      %712 = "affine.linearize_index"(%689#1, %2, %35, %690#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
      %713 = "vector.transfer_read"(%41, %711, %712, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<8x1xf8E4M3FNUZ>
      %714 = "vector.insert_strided_slice"(%713, %710) <{offsets = [0, 5, 0, 0, 0, 0], strides = [1, 1]}> : (vector<8x1xf8E4M3FNUZ>, vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x8x1x1x8x1xf8E4M3FNUZ>
      %715 = "affine.linearize_index"(%689#2, %35, %35, %690#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 4, 8>}> : (index, index, index, index, index) -> index
      %716 = "affine.linearize_index"(%689#1, %1, %35, %690#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
      %717 = "vector.transfer_read"(%41, %715, %716, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<8x1xf8E4M3FNUZ>
      %718 = "vector.insert_strided_slice"(%717, %714) <{offsets = [0, 6, 0, 0, 0, 0], strides = [1, 1]}> : (vector<8x1xf8E4M3FNUZ>, vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x8x1x1x8x1xf8E4M3FNUZ>
      %719 = "affine.linearize_index"(%689#2, %35, %35, %690#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 4, 8>}> : (index, index, index, index, index) -> index
      %720 = "affine.linearize_index"(%689#1, %0, %35, %690#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
      %721 = "vector.transfer_read"(%41, %719, %720, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<8x1xf8E4M3FNUZ>
      %722 = "vector.insert_strided_slice"(%721, %718) <{offsets = [0, 7, 0, 0, 0, 0], strides = [1, 1]}> : (vector<8x1xf8E4M3FNUZ>, vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x8x1x1x8x1xf8E4M3FNUZ>
      %723 = "vector.extract"(%664) <{static_position = array<i64: 0, 0>}> : (vector<2x8x1x1x4x1xf32>) -> vector<1x1x4x1xf32>
      %724 = "vector.extract"(%688) <{static_position = array<i64: 0, 0>}> : (vector<2x1x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %725 = "vector.extract"(%722) <{static_position = array<i64: 0, 0>}> : (vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x1x8x1xf8E4M3FNUZ>
      %726 = "vector.shape_cast"(%724) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %727 = "vector.shape_cast"(%725) : (vector<1x1x8x1xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %728 = "vector.shape_cast"(%723) : (vector<1x1x4x1xf32>) -> vector<4xf32>
      %729 = "amdgpu.mfma"(%726, %727, %728) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
      %730 = "vector.shape_cast"(%729) : (vector<4xf32>) -> vector<1x1x4x1xf32>
      %731 = "vector.insert"(%730, %27) <{static_position = array<i64: 0, 0>}> : (vector<1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
      %732 = "vector.extract"(%664) <{static_position = array<i64: 0, 1>}> : (vector<2x8x1x1x4x1xf32>) -> vector<1x1x4x1xf32>
      %733 = "vector.extract"(%688) <{static_position = array<i64: 0, 0>}> : (vector<2x1x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %734 = "vector.extract"(%722) <{static_position = array<i64: 0, 1>}> : (vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x1x8x1xf8E4M3FNUZ>
      %735 = "vector.shape_cast"(%733) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %736 = "vector.shape_cast"(%734) : (vector<1x1x8x1xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %737 = "vector.shape_cast"(%732) : (vector<1x1x4x1xf32>) -> vector<4xf32>
      %738 = "amdgpu.mfma"(%735, %736, %737) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
      %739 = "vector.shape_cast"(%738) : (vector<4xf32>) -> vector<1x1x4x1xf32>
      %740 = "vector.insert"(%739, %731) <{static_position = array<i64: 0, 1>}> : (vector<1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
      %741 = "vector.extract"(%664) <{static_position = array<i64: 0, 2>}> : (vector<2x8x1x1x4x1xf32>) -> vector<1x1x4x1xf32>
      %742 = "vector.extract"(%688) <{static_position = array<i64: 0, 0>}> : (vector<2x1x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %743 = "vector.extract"(%722) <{static_position = array<i64: 0, 2>}> : (vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x1x8x1xf8E4M3FNUZ>
      %744 = "vector.shape_cast"(%742) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %745 = "vector.shape_cast"(%743) : (vector<1x1x8x1xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %746 = "vector.shape_cast"(%741) : (vector<1x1x4x1xf32>) -> vector<4xf32>
      %747 = "amdgpu.mfma"(%744, %745, %746) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
      %748 = "vector.shape_cast"(%747) : (vector<4xf32>) -> vector<1x1x4x1xf32>
      %749 = "vector.insert"(%748, %740) <{static_position = array<i64: 0, 2>}> : (vector<1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
      %750 = "vector.extract"(%664) <{static_position = array<i64: 0, 3>}> : (vector<2x8x1x1x4x1xf32>) -> vector<1x1x4x1xf32>
      %751 = "vector.extract"(%688) <{static_position = array<i64: 0, 0>}> : (vector<2x1x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %752 = "vector.extract"(%722) <{static_position = array<i64: 0, 3>}> : (vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x1x8x1xf8E4M3FNUZ>
      %753 = "vector.shape_cast"(%751) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %754 = "vector.shape_cast"(%752) : (vector<1x1x8x1xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %755 = "vector.shape_cast"(%750) : (vector<1x1x4x1xf32>) -> vector<4xf32>
      %756 = "amdgpu.mfma"(%753, %754, %755) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
      %757 = "vector.shape_cast"(%756) : (vector<4xf32>) -> vector<1x1x4x1xf32>
      %758 = "vector.insert"(%757, %749) <{static_position = array<i64: 0, 3>}> : (vector<1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
      %759 = "vector.extract"(%664) <{static_position = array<i64: 0, 4>}> : (vector<2x8x1x1x4x1xf32>) -> vector<1x1x4x1xf32>
      %760 = "vector.extract"(%688) <{static_position = array<i64: 0, 0>}> : (vector<2x1x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %761 = "vector.extract"(%722) <{static_position = array<i64: 0, 4>}> : (vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x1x8x1xf8E4M3FNUZ>
      %762 = "vector.shape_cast"(%760) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %763 = "vector.shape_cast"(%761) : (vector<1x1x8x1xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %764 = "vector.shape_cast"(%759) : (vector<1x1x4x1xf32>) -> vector<4xf32>
      %765 = "amdgpu.mfma"(%762, %763, %764) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
      %766 = "vector.shape_cast"(%765) : (vector<4xf32>) -> vector<1x1x4x1xf32>
      %767 = "vector.insert"(%766, %758) <{static_position = array<i64: 0, 4>}> : (vector<1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
      %768 = "vector.extract"(%664) <{static_position = array<i64: 0, 5>}> : (vector<2x8x1x1x4x1xf32>) -> vector<1x1x4x1xf32>
      %769 = "vector.extract"(%688) <{static_position = array<i64: 0, 0>}> : (vector<2x1x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %770 = "vector.extract"(%722) <{static_position = array<i64: 0, 5>}> : (vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x1x8x1xf8E4M3FNUZ>
      %771 = "vector.shape_cast"(%769) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %772 = "vector.shape_cast"(%770) : (vector<1x1x8x1xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %773 = "vector.shape_cast"(%768) : (vector<1x1x4x1xf32>) -> vector<4xf32>
      %774 = "amdgpu.mfma"(%771, %772, %773) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
      %775 = "vector.shape_cast"(%774) : (vector<4xf32>) -> vector<1x1x4x1xf32>
      %776 = "vector.insert"(%775, %767) <{static_position = array<i64: 0, 5>}> : (vector<1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
      %777 = "vector.extract"(%664) <{static_position = array<i64: 0, 6>}> : (vector<2x8x1x1x4x1xf32>) -> vector<1x1x4x1xf32>
      %778 = "vector.extract"(%688) <{static_position = array<i64: 0, 0>}> : (vector<2x1x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %779 = "vector.extract"(%722) <{static_position = array<i64: 0, 6>}> : (vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x1x8x1xf8E4M3FNUZ>
      %780 = "vector.shape_cast"(%778) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %781 = "vector.shape_cast"(%779) : (vector<1x1x8x1xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %782 = "vector.shape_cast"(%777) : (vector<1x1x4x1xf32>) -> vector<4xf32>
      %783 = "amdgpu.mfma"(%780, %781, %782) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
      %784 = "vector.shape_cast"(%783) : (vector<4xf32>) -> vector<1x1x4x1xf32>
      %785 = "vector.insert"(%784, %776) <{static_position = array<i64: 0, 6>}> : (vector<1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
      %786 = "vector.extract"(%664) <{static_position = array<i64: 0, 7>}> : (vector<2x8x1x1x4x1xf32>) -> vector<1x1x4x1xf32>
      %787 = "vector.extract"(%688) <{static_position = array<i64: 0, 0>}> : (vector<2x1x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %788 = "vector.extract"(%722) <{static_position = array<i64: 0, 7>}> : (vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x1x8x1xf8E4M3FNUZ>
      %789 = "vector.shape_cast"(%787) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %790 = "vector.shape_cast"(%788) : (vector<1x1x8x1xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %791 = "vector.shape_cast"(%786) : (vector<1x1x4x1xf32>) -> vector<4xf32>
      %792 = "amdgpu.mfma"(%789, %790, %791) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
      %793 = "vector.shape_cast"(%792) : (vector<4xf32>) -> vector<1x1x4x1xf32>
      %794 = "vector.insert"(%793, %785) <{static_position = array<i64: 0, 7>}> : (vector<1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
      %795 = "vector.extract"(%664) <{static_position = array<i64: 1, 0>}> : (vector<2x8x1x1x4x1xf32>) -> vector<1x1x4x1xf32>
      %796 = "vector.extract"(%688) <{static_position = array<i64: 1, 0>}> : (vector<2x1x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %797 = "vector.extract"(%722) <{static_position = array<i64: 0, 0>}> : (vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x1x8x1xf8E4M3FNUZ>
      %798 = "vector.shape_cast"(%796) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %799 = "vector.shape_cast"(%797) : (vector<1x1x8x1xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %800 = "vector.shape_cast"(%795) : (vector<1x1x4x1xf32>) -> vector<4xf32>
      %801 = "amdgpu.mfma"(%798, %799, %800) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
      %802 = "vector.shape_cast"(%801) : (vector<4xf32>) -> vector<1x1x4x1xf32>
      %803 = "vector.insert"(%802, %794) <{static_position = array<i64: 1, 0>}> : (vector<1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
      %804 = "vector.extract"(%664) <{static_position = array<i64: 1, 1>}> : (vector<2x8x1x1x4x1xf32>) -> vector<1x1x4x1xf32>
      %805 = "vector.extract"(%688) <{static_position = array<i64: 1, 0>}> : (vector<2x1x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %806 = "vector.extract"(%722) <{static_position = array<i64: 0, 1>}> : (vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x1x8x1xf8E4M3FNUZ>
      %807 = "vector.shape_cast"(%805) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %808 = "vector.shape_cast"(%806) : (vector<1x1x8x1xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %809 = "vector.shape_cast"(%804) : (vector<1x1x4x1xf32>) -> vector<4xf32>
      %810 = "amdgpu.mfma"(%807, %808, %809) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
      %811 = "vector.shape_cast"(%810) : (vector<4xf32>) -> vector<1x1x4x1xf32>
      %812 = "vector.insert"(%811, %803) <{static_position = array<i64: 1, 1>}> : (vector<1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
      %813 = "vector.extract"(%664) <{static_position = array<i64: 1, 2>}> : (vector<2x8x1x1x4x1xf32>) -> vector<1x1x4x1xf32>
      %814 = "vector.extract"(%688) <{static_position = array<i64: 1, 0>}> : (vector<2x1x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %815 = "vector.extract"(%722) <{static_position = array<i64: 0, 2>}> : (vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x1x8x1xf8E4M3FNUZ>
      %816 = "vector.shape_cast"(%814) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %817 = "vector.shape_cast"(%815) : (vector<1x1x8x1xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %818 = "vector.shape_cast"(%813) : (vector<1x1x4x1xf32>) -> vector<4xf32>
      %819 = "amdgpu.mfma"(%816, %817, %818) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
      %820 = "vector.shape_cast"(%819) : (vector<4xf32>) -> vector<1x1x4x1xf32>
      %821 = "vector.insert"(%820, %812) <{static_position = array<i64: 1, 2>}> : (vector<1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
      %822 = "vector.extract"(%664) <{static_position = array<i64: 1, 3>}> : (vector<2x8x1x1x4x1xf32>) -> vector<1x1x4x1xf32>
      %823 = "vector.extract"(%688) <{static_position = array<i64: 1, 0>}> : (vector<2x1x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %824 = "vector.extract"(%722) <{static_position = array<i64: 0, 3>}> : (vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x1x8x1xf8E4M3FNUZ>
      %825 = "vector.shape_cast"(%823) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %826 = "vector.shape_cast"(%824) : (vector<1x1x8x1xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %827 = "vector.shape_cast"(%822) : (vector<1x1x4x1xf32>) -> vector<4xf32>
      %828 = "amdgpu.mfma"(%825, %826, %827) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
      %829 = "vector.shape_cast"(%828) : (vector<4xf32>) -> vector<1x1x4x1xf32>
      %830 = "vector.insert"(%829, %821) <{static_position = array<i64: 1, 3>}> : (vector<1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
      %831 = "vector.extract"(%664) <{static_position = array<i64: 1, 4>}> : (vector<2x8x1x1x4x1xf32>) -> vector<1x1x4x1xf32>
      %832 = "vector.extract"(%688) <{static_position = array<i64: 1, 0>}> : (vector<2x1x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %833 = "vector.extract"(%722) <{static_position = array<i64: 0, 4>}> : (vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x1x8x1xf8E4M3FNUZ>
      %834 = "vector.shape_cast"(%832) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %835 = "vector.shape_cast"(%833) : (vector<1x1x8x1xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %836 = "vector.shape_cast"(%831) : (vector<1x1x4x1xf32>) -> vector<4xf32>
      %837 = "amdgpu.mfma"(%834, %835, %836) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
      %838 = "vector.shape_cast"(%837) : (vector<4xf32>) -> vector<1x1x4x1xf32>
      %839 = "vector.insert"(%838, %830) <{static_position = array<i64: 1, 4>}> : (vector<1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
      %840 = "vector.extract"(%664) <{static_position = array<i64: 1, 5>}> : (vector<2x8x1x1x4x1xf32>) -> vector<1x1x4x1xf32>
      %841 = "vector.extract"(%688) <{static_position = array<i64: 1, 0>}> : (vector<2x1x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %842 = "vector.extract"(%722) <{static_position = array<i64: 0, 5>}> : (vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x1x8x1xf8E4M3FNUZ>
      %843 = "vector.shape_cast"(%841) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %844 = "vector.shape_cast"(%842) : (vector<1x1x8x1xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %845 = "vector.shape_cast"(%840) : (vector<1x1x4x1xf32>) -> vector<4xf32>
      %846 = "amdgpu.mfma"(%843, %844, %845) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
      %847 = "vector.shape_cast"(%846) : (vector<4xf32>) -> vector<1x1x4x1xf32>
      %848 = "vector.insert"(%847, %839) <{static_position = array<i64: 1, 5>}> : (vector<1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
      %849 = "vector.extract"(%664) <{static_position = array<i64: 1, 6>}> : (vector<2x8x1x1x4x1xf32>) -> vector<1x1x4x1xf32>
      %850 = "vector.extract"(%688) <{static_position = array<i64: 1, 0>}> : (vector<2x1x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %851 = "vector.extract"(%722) <{static_position = array<i64: 0, 6>}> : (vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x1x8x1xf8E4M3FNUZ>
      %852 = "vector.shape_cast"(%850) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %853 = "vector.shape_cast"(%851) : (vector<1x1x8x1xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %854 = "vector.shape_cast"(%849) : (vector<1x1x4x1xf32>) -> vector<4xf32>
      %855 = "amdgpu.mfma"(%852, %853, %854) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
      %856 = "vector.shape_cast"(%855) : (vector<4xf32>) -> vector<1x1x4x1xf32>
      %857 = "vector.insert"(%856, %848) <{static_position = array<i64: 1, 6>}> : (vector<1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
      %858 = "vector.extract"(%664) <{static_position = array<i64: 1, 7>}> : (vector<2x8x1x1x4x1xf32>) -> vector<1x1x4x1xf32>
      %859 = "vector.extract"(%688) <{static_position = array<i64: 1, 0>}> : (vector<2x1x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
      %860 = "vector.extract"(%722) <{static_position = array<i64: 0, 7>}> : (vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x1x8x1xf8E4M3FNUZ>
      %861 = "vector.shape_cast"(%859) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %862 = "vector.shape_cast"(%860) : (vector<1x1x8x1xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
      %863 = "vector.shape_cast"(%858) : (vector<1x1x4x1xf32>) -> vector<4xf32>
      %864 = "amdgpu.mfma"(%861, %862, %863) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
      %865 = "vector.shape_cast"(%864) : (vector<4xf32>) -> vector<1x1x4x1xf32>
      %866 = "vector.insert"(%865, %857) <{static_position = array<i64: 1, 7>}> : (vector<1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
      "scf.yield"(%567, %612, %866) : (vector<2x1x4xf32>, vector<2x1x4xf32>, vector<2x8x1x1x4x1xf32>) -> ()
    }) : (index, index, index, vector<2x1x4xf32>, vector<2x1x4xf32>, vector<2x8x1x1x4x1xf32>) -> (vector<2x1x4xf32>, vector<2x1x4xf32>, vector<2x8x1x1x4x1xf32>)
    %169 = "vector.extract"(%168#1) <{static_position = array<i64: 0, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
    %170 = "vector.broadcast"(%169) : (vector<4xf32>) -> vector<1x4xf32>
    %171 = "vector.insert"(%170, %6) <{static_position = array<i64: 0, 0, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
    %172 = "vector.extract"(%168#1) <{static_position = array<i64: 1, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
    %173 = "vector.broadcast"(%172) : (vector<4xf32>) -> vector<1x4xf32>
    %174 = "vector.insert"(%173, %171) <{static_position = array<i64: 0, 1, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
    %175 = "vector.extract"(%168#1) <{static_position = array<i64: 0, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
    %176 = "vector.broadcast"(%175) : (vector<4xf32>) -> vector<1x4xf32>
    %177 = "vector.insert"(%176, %174) <{static_position = array<i64: 1, 0, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
    %178 = "vector.extract"(%168#1) <{static_position = array<i64: 1, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
    %179 = "vector.broadcast"(%178) : (vector<4xf32>) -> vector<1x4xf32>
    %180 = "vector.insert"(%179, %177) <{static_position = array<i64: 1, 1, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
    %181 = "vector.extract"(%168#1) <{static_position = array<i64: 0, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
    %182 = "vector.broadcast"(%181) : (vector<4xf32>) -> vector<1x4xf32>
    %183 = "vector.insert"(%182, %180) <{static_position = array<i64: 2, 0, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
    %184 = "vector.extract"(%168#1) <{static_position = array<i64: 1, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
    %185 = "vector.broadcast"(%184) : (vector<4xf32>) -> vector<1x4xf32>
    %186 = "vector.insert"(%185, %183) <{static_position = array<i64: 2, 1, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
    %187 = "vector.extract"(%168#1) <{static_position = array<i64: 0, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
    %188 = "vector.broadcast"(%187) : (vector<4xf32>) -> vector<1x4xf32>
    %189 = "vector.insert"(%188, %186) <{static_position = array<i64: 3, 0, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
    %190 = "vector.extract"(%168#1) <{static_position = array<i64: 1, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
    %191 = "vector.broadcast"(%190) : (vector<4xf32>) -> vector<1x4xf32>
    %192 = "vector.insert"(%191, %189) <{static_position = array<i64: 3, 1, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
    %193 = "vector.extract"(%168#1) <{static_position = array<i64: 0, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
    %194 = "vector.broadcast"(%193) : (vector<4xf32>) -> vector<1x4xf32>
    %195 = "vector.insert"(%194, %192) <{static_position = array<i64: 4, 0, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
    %196 = "vector.extract"(%168#1) <{static_position = array<i64: 1, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
    %197 = "vector.broadcast"(%196) : (vector<4xf32>) -> vector<1x4xf32>
    %198 = "vector.insert"(%197, %195) <{static_position = array<i64: 4, 1, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
    %199 = "vector.extract"(%168#1) <{static_position = array<i64: 0, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
    %200 = "vector.broadcast"(%199) : (vector<4xf32>) -> vector<1x4xf32>
    %201 = "vector.insert"(%200, %198) <{static_position = array<i64: 5, 0, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
    %202 = "vector.extract"(%168#1) <{static_position = array<i64: 1, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
    %203 = "vector.broadcast"(%202) : (vector<4xf32>) -> vector<1x4xf32>
    %204 = "vector.insert"(%203, %201) <{static_position = array<i64: 5, 1, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
    %205 = "vector.extract"(%168#1) <{static_position = array<i64: 0, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
    %206 = "vector.broadcast"(%205) : (vector<4xf32>) -> vector<1x4xf32>
    %207 = "vector.insert"(%206, %204) <{static_position = array<i64: 6, 0, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
    %208 = "vector.extract"(%168#1) <{static_position = array<i64: 1, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
    %209 = "vector.broadcast"(%208) : (vector<4xf32>) -> vector<1x4xf32>
    %210 = "vector.insert"(%209, %207) <{static_position = array<i64: 6, 1, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
    %211 = "vector.extract"(%168#1) <{static_position = array<i64: 0, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
    %212 = "vector.broadcast"(%211) : (vector<4xf32>) -> vector<1x4xf32>
    %213 = "vector.insert"(%212, %210) <{static_position = array<i64: 7, 0, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
    %214 = "vector.extract"(%168#1) <{static_position = array<i64: 1, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
    %215 = "vector.broadcast"(%214) : (vector<4xf32>) -> vector<1x4xf32>
    %216 = "vector.insert"(%215, %213) <{static_position = array<i64: 7, 1, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
    %217 = "arith.divf"(%14, %216) <{fastmath = #arith.fastmath<none>}> : (vector<8x2x1x1x1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
    %218 = "vector.transpose"(%217) <{permutation = array<i64: 1, 0, 3, 2, 5, 4>}> : (vector<8x2x1x1x1x4xf32>) -> vector<2x8x1x1x4x1xf32>
    %219 = "arith.mulf"(%218, %168#2) <{fastmath = #arith.fastmath<none>}> : (vector<2x8x1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
    %220 = "vector.transfer_read"(%90, %34) <{in_bounds = [], operandSegmentSizes = array<i32: 1, 0, 1, 0>, permutation_map = affine_map<() -> ()>}> : (memref<f32, strided<[], offset: ?>, #hal.descriptor_type<storage_buffer>>, f32) -> vector<f32>
    %221 = "vector.extract"(%220) <{static_position = array<i64>}> : (vector<f32>) -> f32
    %222 = "vector.broadcast"(%221) : (f32) -> vector<4x1xf32>
    %223 = "vector.insert"(%222, %27) <{static_position = array<i64: 0, 0, 0, 0>}> : (vector<4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
    %224 = "vector.extract"(%220) <{static_position = array<i64>}> : (vector<f32>) -> f32
    %225 = "vector.broadcast"(%224) : (f32) -> vector<4x1xf32>
    %226 = "vector.insert"(%225, %223) <{static_position = array<i64: 0, 1, 0, 0>}> : (vector<4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
    %227 = "vector.extract"(%220) <{static_position = array<i64>}> : (vector<f32>) -> f32
    %228 = "vector.broadcast"(%227) : (f32) -> vector<4x1xf32>
    %229 = "vector.insert"(%228, %226) <{static_position = array<i64: 0, 2, 0, 0>}> : (vector<4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
    %230 = "vector.extract"(%220) <{static_position = array<i64>}> : (vector<f32>) -> f32
    %231 = "vector.broadcast"(%230) : (f32) -> vector<4x1xf32>
    %232 = "vector.insert"(%231, %229) <{static_position = array<i64: 0, 3, 0, 0>}> : (vector<4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
    %233 = "vector.extract"(%220) <{static_position = array<i64>}> : (vector<f32>) -> f32
    %234 = "vector.broadcast"(%233) : (f32) -> vector<4x1xf32>
    %235 = "vector.insert"(%234, %232) <{static_position = array<i64: 0, 4, 0, 0>}> : (vector<4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
    %236 = "vector.extract"(%220) <{static_position = array<i64>}> : (vector<f32>) -> f32
    %237 = "vector.broadcast"(%236) : (f32) -> vector<4x1xf32>
    %238 = "vector.insert"(%237, %235) <{static_position = array<i64: 0, 5, 0, 0>}> : (vector<4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
    %239 = "vector.extract"(%220) <{static_position = array<i64>}> : (vector<f32>) -> f32
    %240 = "vector.broadcast"(%239) : (f32) -> vector<4x1xf32>
    %241 = "vector.insert"(%240, %238) <{static_position = array<i64: 0, 6, 0, 0>}> : (vector<4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
    %242 = "vector.extract"(%220) <{static_position = array<i64>}> : (vector<f32>) -> f32
    %243 = "vector.broadcast"(%242) : (f32) -> vector<4x1xf32>
    %244 = "vector.insert"(%243, %241) <{static_position = array<i64: 0, 7, 0, 0>}> : (vector<4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
    %245 = "vector.extract"(%220) <{static_position = array<i64>}> : (vector<f32>) -> f32
    %246 = "vector.broadcast"(%245) : (f32) -> vector<4x1xf32>
    %247 = "vector.insert"(%246, %244) <{static_position = array<i64: 1, 0, 0, 0>}> : (vector<4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
    %248 = "vector.extract"(%220) <{static_position = array<i64>}> : (vector<f32>) -> f32
    %249 = "vector.broadcast"(%248) : (f32) -> vector<4x1xf32>
    %250 = "vector.insert"(%249, %247) <{static_position = array<i64: 1, 1, 0, 0>}> : (vector<4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
    %251 = "vector.extract"(%220) <{static_position = array<i64>}> : (vector<f32>) -> f32
    %252 = "vector.broadcast"(%251) : (f32) -> vector<4x1xf32>
    %253 = "vector.insert"(%252, %250) <{static_position = array<i64: 1, 2, 0, 0>}> : (vector<4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
    %254 = "vector.extract"(%220) <{static_position = array<i64>}> : (vector<f32>) -> f32
    %255 = "vector.broadcast"(%254) : (f32) -> vector<4x1xf32>
    %256 = "vector.insert"(%255, %253) <{static_position = array<i64: 1, 3, 0, 0>}> : (vector<4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
    %257 = "vector.extract"(%220) <{static_position = array<i64>}> : (vector<f32>) -> f32
    %258 = "vector.broadcast"(%257) : (f32) -> vector<4x1xf32>
    %259 = "vector.insert"(%258, %256) <{static_position = array<i64: 1, 4, 0, 0>}> : (vector<4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
    %260 = "vector.extract"(%220) <{static_position = array<i64>}> : (vector<f32>) -> f32
    %261 = "vector.broadcast"(%260) : (f32) -> vector<4x1xf32>
    %262 = "vector.insert"(%261, %259) <{static_position = array<i64: 1, 5, 0, 0>}> : (vector<4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
    %263 = "vector.extract"(%220) <{static_position = array<i64>}> : (vector<f32>) -> f32
    %264 = "vector.broadcast"(%263) : (f32) -> vector<4x1xf32>
    %265 = "vector.insert"(%264, %262) <{static_position = array<i64: 1, 6, 0, 0>}> : (vector<4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
    %266 = "vector.extract"(%220) <{static_position = array<i64>}> : (vector<f32>) -> f32
    %267 = "vector.broadcast"(%266) : (f32) -> vector<4x1xf32>
    %268 = "vector.insert"(%267, %265) <{static_position = array<i64: 1, 7, 0, 0>}> : (vector<4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
    %269 = "arith.divf"(%219, %268) <{fastmath = #arith.fastmath<none>}> : (vector<2x8x1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
    %270 = "arith.cmpf"(%269, %16) <{fastmath = #arith.fastmath<none>, predicate = 11 : i64}> : (vector<2x8x1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xi1>
    %271 = "arith.select"(%270, %16, %269) : (vector<2x8x1x1x4x1xi1>, vector<2x8x1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
    %272 = "arith.cmpf"(%271, %15) <{fastmath = #arith.fastmath<none>, predicate = 9 : i64}> : (vector<2x8x1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xi1>
    %273 = "arith.select"(%272, %15, %271) : (vector<2x8x1x1x4x1xi1>, vector<2x8x1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
    %274 = "arith.truncf"(%273) : (vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf8E4M3FNUZ>
    %275:4 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 1, 1, 64>}> : (index) -> (index, index, index, index)
    %276:3 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index, index)
    %277 = "affine.linearize_index"(%275#2, %35, %35, %276#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
    %278 = "affine.linearize_index"(%275#1, %35, %35, %276#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
    %279 = "vector.extract"(%274) <{static_position = array<i64: 0, 0, 0, 0>}> : (vector<2x8x1x1x4x1xf8E4M3FNUZ>) -> vector<4x1xf8E4M3FNUZ>
    "vector.transfer_write"(%279, %103, %277, %278) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<4x1xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, strided<[4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
    %280 = "affine.linearize_index"(%275#2, %35, %35, %276#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
    %281 = "affine.linearize_index"(%275#1, %30, %35, %276#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
    %282 = "vector.extract"(%274) <{static_position = array<i64: 0, 1, 0, 0>}> : (vector<2x8x1x1x4x1xf8E4M3FNUZ>) -> vector<4x1xf8E4M3FNUZ>
    "vector.transfer_write"(%282, %103, %280, %281) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<4x1xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, strided<[4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
    %283 = "affine.linearize_index"(%275#2, %35, %35, %276#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
    %284 = "affine.linearize_index"(%275#1, %13, %35, %276#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
    %285 = "vector.extract"(%274) <{static_position = array<i64: 0, 2, 0, 0>}> : (vector<2x8x1x1x4x1xf8E4M3FNUZ>) -> vector<4x1xf8E4M3FNUZ>
    "vector.transfer_write"(%285, %103, %283, %284) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<4x1xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, strided<[4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
    %286 = "affine.linearize_index"(%275#2, %35, %35, %276#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
    %287 = "affine.linearize_index"(%275#1, %12, %35, %276#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
    %288 = "vector.extract"(%274) <{static_position = array<i64: 0, 3, 0, 0>}> : (vector<2x8x1x1x4x1xf8E4M3FNUZ>) -> vector<4x1xf8E4M3FNUZ>
    "vector.transfer_write"(%288, %103, %286, %287) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<4x1xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, strided<[4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
    %289 = "affine.linearize_index"(%275#2, %35, %35, %276#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
    %290 = "affine.linearize_index"(%275#1, %3, %35, %276#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
    %291 = "vector.extract"(%274) <{static_position = array<i64: 0, 4, 0, 0>}> : (vector<2x8x1x1x4x1xf8E4M3FNUZ>) -> vector<4x1xf8E4M3FNUZ>
    "vector.transfer_write"(%291, %103, %289, %290) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<4x1xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, strided<[4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
    %292 = "affine.linearize_index"(%275#2, %35, %35, %276#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
    %293 = "affine.linearize_index"(%275#1, %2, %35, %276#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
    %294 = "vector.extract"(%274) <{static_position = array<i64: 0, 5, 0, 0>}> : (vector<2x8x1x1x4x1xf8E4M3FNUZ>) -> vector<4x1xf8E4M3FNUZ>
    "vector.transfer_write"(%294, %103, %292, %293) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<4x1xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, strided<[4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
    %295 = "affine.linearize_index"(%275#2, %35, %35, %276#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
    %296 = "affine.linearize_index"(%275#1, %1, %35, %276#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
    %297 = "vector.extract"(%274) <{static_position = array<i64: 0, 6, 0, 0>}> : (vector<2x8x1x1x4x1xf8E4M3FNUZ>) -> vector<4x1xf8E4M3FNUZ>
    "vector.transfer_write"(%297, %103, %295, %296) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<4x1xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, strided<[4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
    %298 = "affine.linearize_index"(%275#2, %35, %35, %276#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
    %299 = "affine.linearize_index"(%275#1, %0, %35, %276#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
    %300 = "vector.extract"(%274) <{static_position = array<i64: 0, 7, 0, 0>}> : (vector<2x8x1x1x4x1xf8E4M3FNUZ>) -> vector<4x1xf8E4M3FNUZ>
    "vector.transfer_write"(%300, %103, %298, %299) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<4x1xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, strided<[4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
    %301 = "affine.linearize_index"(%275#2, %30, %35, %276#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
    %302 = "affine.linearize_index"(%275#1, %35, %35, %276#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
    %303 = "vector.extract"(%274) <{static_position = array<i64: 1, 0, 0, 0>}> : (vector<2x8x1x1x4x1xf8E4M3FNUZ>) -> vector<4x1xf8E4M3FNUZ>
    "vector.transfer_write"(%303, %103, %301, %302) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<4x1xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, strided<[4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
    %304 = "affine.linearize_index"(%275#2, %30, %35, %276#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
    %305 = "affine.linearize_index"(%275#1, %30, %35, %276#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
    %306 = "vector.extract"(%274) <{static_position = array<i64: 1, 1, 0, 0>}> : (vector<2x8x1x1x4x1xf8E4M3FNUZ>) -> vector<4x1xf8E4M3FNUZ>
    "vector.transfer_write"(%306, %103, %304, %305) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<4x1xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, strided<[4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
    %307 = "affine.linearize_index"(%275#2, %30, %35, %276#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
    %308 = "affine.linearize_index"(%275#1, %13, %35, %276#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
    %309 = "vector.extract"(%274) <{static_position = array<i64: 1, 2, 0, 0>}> : (vector<2x8x1x1x4x1xf8E4M3FNUZ>) -> vector<4x1xf8E4M3FNUZ>
    "vector.transfer_write"(%309, %103, %307, %308) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<4x1xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, strided<[4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
    %310 = "affine.linearize_index"(%275#2, %30, %35, %276#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
    %311 = "affine.linearize_index"(%275#1, %12, %35, %276#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
    %312 = "vector.extract"(%274) <{static_position = array<i64: 1, 3, 0, 0>}> : (vector<2x8x1x1x4x1xf8E4M3FNUZ>) -> vector<4x1xf8E4M3FNUZ>
    "vector.transfer_write"(%312, %103, %310, %311) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<4x1xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, strided<[4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
    %313 = "affine.linearize_index"(%275#2, %30, %35, %276#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
    %314 = "affine.linearize_index"(%275#1, %3, %35, %276#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
    %315 = "vector.extract"(%274) <{static_position = array<i64: 1, 4, 0, 0>}> : (vector<2x8x1x1x4x1xf8E4M3FNUZ>) -> vector<4x1xf8E4M3FNUZ>
    "vector.transfer_write"(%315, %103, %313, %314) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<4x1xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, strided<[4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
    %316 = "affine.linearize_index"(%275#2, %30, %35, %276#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
    %317 = "affine.linearize_index"(%275#1, %2, %35, %276#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
    %318 = "vector.extract"(%274) <{static_position = array<i64: 1, 5, 0, 0>}> : (vector<2x8x1x1x4x1xf8E4M3FNUZ>) -> vector<4x1xf8E4M3FNUZ>
    "vector.transfer_write"(%318, %103, %316, %317) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<4x1xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, strided<[4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
    %319 = "affine.linearize_index"(%275#2, %30, %35, %276#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
    %320 = "affine.linearize_index"(%275#1, %1, %35, %276#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
    %321 = "vector.extract"(%274) <{static_position = array<i64: 1, 6, 0, 0>}> : (vector<2x8x1x1x4x1xf8E4M3FNUZ>) -> vector<4x1xf8E4M3FNUZ>
    "vector.transfer_write"(%321, %103, %319, %320) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<4x1xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, strided<[4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
    %322 = "affine.linearize_index"(%275#2, %30, %35, %276#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
    %323 = "affine.linearize_index"(%275#1, %0, %35, %276#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
    %324 = "vector.extract"(%274) <{static_position = array<i64: 1, 7, 0, 0>}> : (vector<2x8x1x1x4x1xf8E4M3FNUZ>) -> vector<4x1xf8E4M3FNUZ>
    "vector.transfer_write"(%324, %103, %322, %323) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<4x1xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, strided<[4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
    "scf.forall.in_parallel"() ({
    ^bb0:
    }) : () -> ()
  }) : (index) -> ()
  "memref.dealloc"(%44) : (memref<1x32x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>) -> ()
  "memref.dealloc"(%43) : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>) -> ()
  "memref.dealloc"(%42) : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>) -> ()
  "memref.dealloc"(%41) : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>) -> ()
  "memref.dealloc"(%40) : (memref<32x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>) -> ()
  "func.return"() : () -> ()
 }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} : () -> ()
    %1 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4, #map5]} ins(%collapsed, %collapsed_1, %collapsed_2, %extracted, %arg4 : tensor<32x?x128xf8E4M3FNUZ>, tensor<32x?x128xf8E4M3FNUZ>, tensor<32x?x128xf8E4M3FNUZ>, f32, tensor<?x?xf8E4M3FNUZ>) outs(%cast : tensor<32x?x128xf32>) {
         ^
 f8_attn_chi_castf32_roctorch.mlir:45778:10: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
    %1 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4, #map5]} ins(%collapsed, %collapsed_1, %collapsed_2, %extracted, %arg4 : tensor<32x?x128xf8E4M3FNUZ>, tensor<32x?x128xf8E4M3FNUZ>, tensor<32x?x128xf8E4M3FNUZ>, f32, tensor<?x?xf8E4M3FNUZ>) outs(%cast : tensor<32x?x128xf32>) {
         ^
 f8_attn_chi_castf32_roctorch.mlir:2706:12: note: called from
    %914 = util.call @sharktank_masked_flash_attention_1_32_128_128_f8E4M3FNUZ_f32_f32(%909, %910, %911, %913, %912) : (tensor<1x32x?x128xf8E4M3FNUZ>, tensor<1x32x?x128xf8E4M3FNUZ>, tensor<1x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?xf8E4M3FNUZ>) -> tensor<1x32x?x128xf32>
           ^
 f8_attn_chi_castf32_roctorch.mlir:45778:10: note: see current operation:
 "hal.executable.variant"() ({
  "hal.executable.export"() ({
  ^bb0(%arg7: !hal.device, %arg8: index, %arg9: index, %arg10: index, %arg11: index):
    %867:3 = "flow.dispatch.workgroup_count_from_slice"(%arg8, %arg9, %arg10, %arg11) : (index, index, index, index) -> (index, index, index)
    "hal.return"(%867#0, %867#1, %867#2) : (index, index, index) -> ()
  }) {layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_18_attention_8x4x1xDx32x128xf8E4M3FNUZ_generic"} : () -> ()
  "builtin.module"() ({
    "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_18_attention_8x4x1xDx32x128xf8E4M3FNUZ_generic"}> ({
      %0 = "arith.constant"() <{value = 7 : index}> : () -> index
      %1 = "arith.constant"() <{value = 6 : index}> : () -> index
      %2 = "arith.constant"() <{value = 5 : index}> : () -> index
      %3 = "arith.constant"() <{value = 4 : index}> : () -> index
      %4 = "arith.constant"() <{value = dense<0.000000e+00> : vector<1x8x1x1x8x1xf8E4M3FNUZ>}> : () -> vector<1x8x1x1x8x1xf8E4M3FNUZ>
      %5 = "arith.constant"() <{value = dense<0.000000e+00> : vector<2x1x1x1x1x8xf8E4M3FNUZ>}> : () -> vector<2x1x1x1x1x8xf8E4M3FNUZ>
      %6 = "arith.constant"() <{value = dense<0.000000e+00> : vector<8x2x1x1x1x4xf32>}> : () -> vector<8x2x1x1x1x4xf32>
      %7 = "arith.constant"() <{value = dense<0.000000e+00> : vector<2x2x1x1x1x4xf32>}> : () -> vector<2x2x1x1x1x4xf32>
      %8 = "arith.constant"() <{value = dense<0.000000e+00> : vector<8xf32>}> : () -> vector<8xf32>
      %9 = "arith.constant"() <{value = dense<0xFF800000> : vector<2x1x4xf32>}> : () -> vector<2x1x4xf32>
      %10 = "arith.constant"() <{value = dense<0.000000e+00> : vector<2x2x1x1x4x1xf8E4M3FNUZ>}> : () -> vector<2x2x1x1x4x1xf8E4M3FNUZ>
      %11 = "arith.constant"() <{value = dense<0.000000e+00> : vector<2x4x1x1x1x8xf8E4M3FNUZ>}> : () -> vector<2x4x1x1x1x8xf8E4M3FNUZ>
      %12 = "arith.constant"() <{value = 3 : index}> : () -> index
      %13 = "arith.constant"() <{value = 2 : index}> : () -> index
      %14 = "arith.constant"() <{value = dense<1.000000e+00> : vector<8x2x1x1x1x4xf32>}> : () -> vector<8x2x1x1x1x4xf32>
      %15 = "arith.constant"() <{value = dense<2.400000e+02> : vector<2x8x1x1x4x1xf32>}> : () -> vector<2x8x1x1x4x1xf32>
      %16 = "arith.constant"() <{value = dense<-2.400000e+02> : vector<2x8x1x1x4x1xf32>}> : () -> vector<2x8x1x1x4x1xf32>
      %17 = "arith.constant"() <{value = dense<2.400000e+02> : vector<2x2x1x1x4x1xf32>}> : () -> vector<2x2x1x1x4x1xf32>
      %18 = "arith.constant"() <{value = dense<1.44269502> : vector<2x2x1x1x4x1xf32>}> : () -> vector<2x2x1x1x4x1xf32>
      %19 = "arith.constant"() <{value = dense<0.00416666688> : vector<2x2x1x1x4x1xf32>}> : () -> vector<2x2x1x1x4x1xf32>
      %20 = "arith.constant"() <{value = dense<0xFF800000> : vector<32x32xf32>}> : () -> vector<32x32xf32>
      %21 = "arith.constant"() <{value = 0 : i64}> : () -> i64
      %22 = "arith.constant"() <{value = 0 : i8}> : () -> i8
      %23 = "arith.constant"() <{value = dense<0.000000e+00> : vector<32x32xf32>}> : () -> vector<32x32xf32>
      %24 = "arith.constant"() <{value = dense<0.000000e+00> : vector<2x2x1x1x4x1xf32>}> : () -> vector<2x2x1x1x4x1xf32>
      %25 = "arith.constant"() <{value = dense<0.000000e+00> : vector<2x1x4xf32>}> : () -> vector<2x1x4xf32>
      %26 = "arith.constant"() <{value = dense<-3.40282347E+38> : vector<2x1x4xf32>}> : () -> vector<2x1x4xf32>
      %27 = "arith.constant"() <{value = dense<0.000000e+00> : vector<2x8x1x1x4x1xf32>}> : () -> vector<2x8x1x1x4x1xf32>
      %28 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
      %29 = "arith.constant"() <{value = 1.44269502 : f32}> : () -> f32
      %30 = "arith.constant"() <{value = 1 : index}> : () -> index
      %31 = "arith.constant"() <{value = 32 : index}> : () -> index
      %32 = "arith.constant"() <{value = 67108864 : index}> : () -> index
      %33 = "arith.constant"() <{value = 32 : i64}> : () -> i64
      %34 = "arith.constant"() <{value = 0.000000e+00 : f32}> : () -> f32
      %35 = "arith.constant"() <{value = 0 : index}> : () -> index
      %36 = "gpu.thread_id"() <{dimension = #gpu<dim z>}> : () -> index
      %37 = "gpu.thread_id"() <{dimension = #gpu<dim y>}> : () -> index
      %38 = "gpu.thread_id"() <{dimension = #gpu<dim x>}> : () -> index
      %39 = "affine.linearize_index"(%36, %37, %38) <{disjoint, operandSegmentSizes = array<i32: 3, 0>, static_basis = array<i64: 1, 1, 64>}> : (index, index, index) -> index
      %40 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<32x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>
      %41 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
      %42 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
      %43 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
      %44 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x32x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>
      %45 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
      %46 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
      %47 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
      %48 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
      %49 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
      %50 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
      %51 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
      %52 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
      %53 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
      %54 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
      %55 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 10 : index} : () -> i32
      %56 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 11 : index} : () -> i32
      %57 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 12 : index} : () -> i32
      %58 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 13 : index} : () -> i32
      %59 = "arith.extui"(%45) : (i32) -> i64
      %60 = "arith.extui"(%46) : (i32) -> i64
      %61 = "arith.shli"(%60, %33) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %62 = "arith.ori"(%59, %61) : (i64, i64) -> i64
      %63 = "arith.index_castui"(%62) : (i64) -> index
      %64 = "arith.extui"(%47) : (i32) -> i64
      %65 = "arith.extui"(%48) : (i32) -> i64
      %66 = "arith.shli"(%65, %33) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %67 = "arith.ori"(%64, %66) : (i64, i64) -> i64
      %68 = "arith.index_castui"(%67) : (i64) -> index
      %69 = "arith.extui"(%49) : (i32) -> i64
      %70 = "arith.extui"(%50) : (i32) -> i64
      %71 = "arith.shli"(%70, %33) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %72 = "arith.ori"(%69, %71) : (i64, i64) -> i64
      %73 = "arith.index_castui"(%72) : (i64) -> index
      %74 = "arith.extui"(%51) : (i32) -> i64
      %75 = "arith.extui"(%52) : (i32) -> i64
      %76 = "arith.shli"(%75, %33) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %77 = "arith.ori"(%74, %76) : (i64, i64) -> i64
      %78 = "arith.index_castui"(%77) {stream.alignment = 64 : index, stream.values = [1075847616 : index, 1293968512 : index, 1512089408 : index, 1730210304 : index, 1948331200 : index, 2166452096 : index, 2384572992 : index, 2602693888 : index, 2820814784 : index, 3038935680 : index, 3257056576 : index, 3475177472 : index, 3693298368 : index, 3911419264 : index, 4129540160 : index, 4347661056 : index, 4565781952 : index, 4783902848 : index, 5002023744 : index, 5220144640 : index, 5438265536 : index, 5656386432 : index, 5874507328 : index, 6092628224 : index, 6310749120 : index, 6528870016 : index, 6746990912 : index, 6965111808 : index, 7183232704 : index, 7401353600 : index, 7619474496 : index, 7837595392 : index]} : (i64) -> index
      %79 = "arith.extui"(%53) : (i32) -> i64
      %80 = "arith.extui"(%54) : (i32) -> i64
      %81 = "arith.shli"(%80, %33) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %82 = "arith.ori"(%79, %81) : (i64, i64) -> i64
      %83 = "arith.index_castui"(%82) : (i64) -> index
      %84 = "arith.index_castui"(%55) : (i32) -> index
      %85 = "arith.bitcast"(%56) : (i32) -> f32
      %86 = "arith.index_castui"(%57) : (i32) -> index
      %87 = "arith.index_castui"(%58) : (i32) -> index
      %88:8 = "util.assume.int"(%63, %68, %73, %78, %83, %84, %86, %87) <{assumptions = [[#util.int.assumption<umin = 68027392, umax = 20995769344>], [#util.int.assumption<umin = 68158464, umax = 21532509184>], [#util.int.assumption<umin = 68355072, umax = 22337618944>], [#util.int.assumption<umin = 1075847616, umax = 1075847616, udiv = 1075847616>, #util.int.assumption<umin = 1293968512, umax = 1293968512, udiv = 1293968512>, #util.int.assumption<umin = 1512089408, umax = 1512089408, udiv = 1512089408>, #util.int.assumption<umin = 1730210304, umax = 1730210304, udiv = 1730210304>, #util.int.assumption<umin = 1948331200, umax = 1948331200, udiv = 1948331200>, #util.int.assumption<umin = 2166452096, umax = 2166452096, udiv = 2166452096>, #util.int.assumption<umin = 2384572992, umax = 2384572992, udiv = 2384572992>, #util.int.assumption<umin = 2602693888, umax = 2602693888, udiv = 2602693888>, #util.int.assumption<umin = 2820814784, umax = 2820814784, udiv = 2820814784>, #util.int.assumption<umin = 3038935680, umax = 3038935680, udiv = 3038935680>, #util.int.assumption<umin = 3257056576, umax = 3257056576, udiv = 3257056576>, #util.int.assumption<umin = 3475177472, umax = 3475177472, udiv = 3475177472>, #util.int.assumption<umin = 3693298368, umax = 3693298368, udiv = 3693298368>, #util.int.assumption<umin = 3911419264, umax = 3911419264, udiv = 3911419264>, #util.int.assumption<umin = 4129540160, umax = 4129540160, udiv = 4129540160>, #util.int.assumption<umin = 4347661056, umax = 4347661056, udiv = 4347661056>, #util.int.assumption<umin = 4565781952, umax = 4565781952, udiv = 4565781952>, #util.int.assumption<umin = 4783902848, umax = 4783902848, udiv = 4783902848>, #util.int.assumption<umin = 5002023744, umax = 5002023744, udiv = 5002023744>, #util.int.assumption<umin = 5220144640, umax = 5220144640, udiv = 5220144640>, #util.int.assumption<umin = 5438265536, umax = 5438265536, udiv = 5438265536>, #util.int.assumption<umin = 5656386432, umax = 5656386432, udiv = 5656386432>, #util.int.assumption<umin = 5874507328, umax = 5874507328, udiv = 5874507328>, #util.int.assumption<umin = 6092628224, umax = 6092628224, udiv = 6092628224>, #util.int.assumption<umin = 6310749120, umax = 6310749120, udiv = 6310749120>, #util.int.assumption<umin = 6528870016, umax = 6528870016, udiv = 6528870016>, #util.int.assumption<umin = 6746990912, umax = 6746990912, udiv = 6746990912>, #util.int.assumption<umin = 6965111808, umax = 6965111808, udiv = 6965111808>, #util.int.assumption<umin = 7183232704, umax = 7183232704, udiv = 7183232704>, #util.int.assumption<umin = 7401353600, umax = 7401353600, udiv = 7401353600>, #util.int.assumption<umin = 7619474496, umax = 7619474496, udiv = 7619474496>, #util.int.assumption<umin = 7837595392, umax = 7837595392, udiv = 7837595392>], [#util.int.assumption<umin = 67896320, umax = 20459029504>], [#util.int.assumption<umin = 32, umax = 131040, udiv = 32>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 32, umax = 131040, udiv = 32>]]}> : (index, index, index, index, index, index, index, index) -> (index, index, index, index, index, index, index, index)
      %89 = "hal.interface.binding.subspan"(%35) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 0>} : (index) -> memref<i64, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%89) <{alignment = 64 : i32}> : (memref<i64, #hal.descriptor_type<storage_buffer>>) -> ()
      %90 = "hal.interface.binding.subspan"(%88#3) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 1 : i32, layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 0>} : (index) -> memref<f32, strided<[], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%90) <{alignment = 64 : i32}> : (memref<f32, strided<[], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %91 = "flow.dispatch.workload.ordinal"(%88#5) <{ordinal = 0 : index}> : (index) -> index
      %92 = "flow.dispatch.workload.ordinal"(%88#6) <{ordinal = 1 : index}> : (index) -> index
      %93 = "flow.dispatch.workload.ordinal"(%88#6) <{ordinal = 2 : index}> : (index) -> index
      %94 = "flow.dispatch.workload.ordinal"(%88#7) <{ordinal = 3 : index}> : (index) -> index
      %95 = "hal.interface.binding.subspan"(%32, %92, %91) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 2>} : (index, index, index) -> memref<?x32x?xi8, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%95) <{alignment = 64 : i32}> : (memref<?x32x?xi8, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %96 = "hal.interface.binding.subspan"(%88#0, %93) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<8x4x1x?x32x128xf8E4M3FNUZ, strided<[?, ?, ?, 4096, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%96) <{alignment = 1 : i32}> : (memref<8x4x1x?x32x128xf8E4M3FNUZ, strided<[?, ?, ?, 4096, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %97 = "arith.divsi"(%94, %31) : (index, index) -> index
      %98 = "hal.interface.binding.subspan"(%88#1, %97) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<8x4x?x32x128xf8E4M3FNUZ, strided<[?, ?, 4096, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%98) <{alignment = 1 : i32}> : (memref<8x4x?x32x128xf8E4M3FNUZ, strided<[?, ?, 4096, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %99 = "arith.divsi"(%91, %31) : (index, index) -> index
      %100 = "hal.interface.binding.subspan"(%88#2, %99) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xf8E4M3FNUZ, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%100) <{alignment = 1 : i32}> : (memref<?x32x8x128xf8E4M3FNUZ, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %101 = "hal.interface.binding.subspan"(%88#4, %92) {alignment = 64 : index, binding = 3 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 14, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x4x128xf8E4M3FNUZ, strided<[?, 131072, 4096, 512, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%101) <{alignment = 1 : i32}> : (memref<1x?x32x8x4x128xf8E4M3FNUZ, strided<[?, 131072, 4096, 512, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      "scf.forall"(%93) <{mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0>, staticStep = array<i64: 1, 1, 1>, staticUpperBound = array<i64: 8, 4, -9223372036854775808>}> ({
      ^bb0(%arg0: index, %arg1: index, %arg2: index):
        "gpu.barrier"() : () -> ()
        %102 = "memref.subview"(%101, %arg2, %arg0, %arg1) <{operandSegmentSizes = array<i32: 1, 3, 0, 0>, static_offsets = array<i64: 0, -9223372036854775808, 0, -9223372036854775808, -9223372036854775808, 0>, static_sizes = array<i64: 1, 1, 32, 1, 1, 128>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x?x32x8x4x128xf8E4M3FNUZ, strided<[?, 131072, 4096, 512, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index) -> memref<1x1x32x1x1x128xf8E4M3FNUZ, strided<[?, 131072, 4096, 512, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        %103 = "memref.subview"(%102) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0, 0, 0, 0, 0>, static_sizes = array<i64: 1, 1, 32, 1, 1, 128>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x32x1x1x128xf8E4M3FNUZ, strided<[?, 131072, 4096, 512, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> memref<32x128xf8E4M3FNUZ, strided<[4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        %104:4 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 1, 1, 64>}> : (index) -> (index, index, index, index)
        %105:3 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 8, 8>}> : (index) -> (index, index, index)
        %106 = "affine.linearize_index"(%104#2, %35, %35, %105#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
        %107 = "affine.linearize_index"(%104#1, %35, %35, %105#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
        %108 = "vector.transfer_read"(%96, %arg0, %arg1, %35, %arg2, %106, %107, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 6, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d4, d5)>}> : (memref<8x4x1x?x32x128xf8E4M3FNUZ, strided<[?, ?, ?, 4096, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, index, f8E4M3FNUZ) -> vector<1x16xf8E4M3FNUZ>
        %109 = "affine.linearize_index"(%104#2, %30, %35, %105#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
        %110 = "affine.linearize_index"(%104#1, %35, %35, %105#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
        %111 = "vector.transfer_read"(%96, %arg0, %arg1, %35, %arg2, %109, %110, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 6, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d4, d5)>}> : (memref<8x4x1x?x32x128xf8E4M3FNUZ, strided<[?, ?, ?, 4096, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, index, f8E4M3FNUZ) -> vector<1x16xf8E4M3FNUZ>
        %112 = "affine.linearize_index"(%104#2, %13, %35, %105#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
        %113 = "affine.linearize_index"(%104#1, %35, %35, %105#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
        %114 = "vector.transfer_read"(%96, %arg0, %arg1, %35, %arg2, %112, %113, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 6, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d4, d5)>}> : (memref<8x4x1x?x32x128xf8E4M3FNUZ, strided<[?, ?, ?, 4096, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, index, f8E4M3FNUZ) -> vector<1x16xf8E4M3FNUZ>
        %115 = "affine.linearize_index"(%104#2, %12, %35, %105#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
        %116 = "affine.linearize_index"(%104#1, %35, %35, %105#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
        %117 = "vector.transfer_read"(%96, %arg0, %arg1, %35, %arg2, %115, %116, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 6, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d4, d5)>}> : (memref<8x4x1x?x32x128xf8E4M3FNUZ, strided<[?, ?, ?, 4096, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, index, f8E4M3FNUZ) -> vector<1x16xf8E4M3FNUZ>
        %118 = "arith.mulf"(%85, %29) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
        %119:4 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 1, 1, 64>}> : (index) -> (index, index, index, index)
        %120:3 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 8, 8>}> : (index) -> (index, index, index)
        %121 = "affine.linearize_index"(%119#2, %35, %35, %120#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
        %122 = "affine.linearize_index"(%119#1, %35, %35, %120#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
        "vector.transfer_write"(%108, %43, %121, %122) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<1x16xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index) -> ()
        %123 = "affine.linearize_index"(%119#2, %30, %35, %120#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
        %124 = "affine.linearize_index"(%119#1, %35, %35, %120#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
        "vector.transfer_write"(%111, %43, %123, %124) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<1x16xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index) -> ()
        %125 = "affine.linearize_index"(%119#2, %13, %35, %120#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
        %126 = "affine.linearize_index"(%119#1, %35, %35, %120#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
        "vector.transfer_write"(%114, %43, %125, %126) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<1x16xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index) -> ()
        %127 = "affine.linearize_index"(%119#2, %12, %35, %120#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
        %128 = "affine.linearize_index"(%119#1, %35, %35, %120#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
        "vector.transfer_write"(%117, %43, %127, %128) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<1x16xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index) -> ()
        "gpu.barrier"() : () -> ()
        %129:4 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 1, 1, 64>}> : (index) -> (index, index, index, index)
        %130:3 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index, index)
        %131 = "affine.linearize_index"(%129#2, %35, %35, %130#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
        %132 = "affine.linearize_index"(%129#1, %35, %35, %130#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 4, 8>}> : (index, index, index, index, index) -> index
        %133 = "vector.transfer_read"(%43, %131, %132, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<1x8xf8E4M3FNUZ>
        %134 = "vector.insert_strided_slice"(%133, %11) <{offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]}> : (vector<1x8xf8E4M3FNUZ>, vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<2x4x1x1x1x8xf8E4M3FNUZ>
        %135 = "affine.linearize_index"(%129#2, %35, %35, %130#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
        %136 = "affine.linearize_index"(%129#1, %30, %35, %130#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 4, 8>}> : (index, index, index, index, index) -> index
        %137 = "vector.transfer_read"(%43, %135, %136, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<1x8xf8E4M3FNUZ>
        %138 = "vector.insert_strided_slice"(%137, %134) <{offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]}> : (vector<1x8xf8E4M3FNUZ>, vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<2x4x1x1x1x8xf8E4M3FNUZ>
        %139 = "affine.linearize_index"(%129#2, %35, %35, %130#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
        %140 = "affine.linearize_index"(%129#1, %13, %35, %130#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 4, 8>}> : (index, index, index, index, index) -> index
        %141 = "vector.transfer_read"(%43, %139, %140, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<1x8xf8E4M3FNUZ>
        %142 = "vector.insert_strided_slice"(%141, %138) <{offsets = [0, 2, 0, 0, 0, 0], strides = [1, 1]}> : (vector<1x8xf8E4M3FNUZ>, vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<2x4x1x1x1x8xf8E4M3FNUZ>
        %143 = "affine.linearize_index"(%129#2, %35, %35, %130#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
        %144 = "affine.linearize_index"(%129#1, %12, %35, %130#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 4, 8>}> : (index, index, index, index, index) -> index
        %145 = "vector.transfer_read"(%43, %143, %144, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<1x8xf8E4M3FNUZ>
        %146 = "vector.insert_strided_slice"(%145, %142) <{offsets = [0, 3, 0, 0, 0, 0], strides = [1, 1]}> : (vector<1x8xf8E4M3FNUZ>, vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<2x4x1x1x1x8xf8E4M3FNUZ>
        %147 = "affine.linearize_index"(%129#2, %30, %35, %130#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
        %148 = "affine.linearize_index"(%129#1, %35, %35, %130#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 4, 8>}> : (index, index, index, index, index) -> index
        %149 = "vector.transfer_read"(%43, %147, %148, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<1x8xf8E4M3FNUZ>
        %150 = "vector.insert_strided_slice"(%149, %146) <{offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]}> : (vector<1x8xf8E4M3FNUZ>, vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<2x4x1x1x1x8xf8E4M3FNUZ>
        %151 = "affine.linearize_index"(%129#2, %30, %35, %130#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
        %152 = "affine.linearize_index"(%129#1, %30, %35, %130#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 4, 8>}> : (index, index, index, index, index) -> index
        %153 = "vector.transfer_read"(%43, %151, %152, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<1x8xf8E4M3FNUZ>
        %154 = "vector.insert_strided_slice"(%153, %150) <{offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]}> : (vector<1x8xf8E4M3FNUZ>, vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<2x4x1x1x1x8xf8E4M3FNUZ>
        %155 = "affine.linearize_index"(%129#2, %30, %35, %130#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
        %156 = "affine.linearize_index"(%129#1, %13, %35, %130#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 4, 8>}> : (index, index, index, index, index) -> index
        %157 = "vector.transfer_read"(%43, %155, %156, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<1x8xf8E4M3FNUZ>
        %158 = "vector.insert_strided_slice"(%157, %154) <{offsets = [1, 2, 0, 0, 0, 0], strides = [1, 1]}> : (vector<1x8xf8E4M3FNUZ>, vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<2x4x1x1x1x8xf8E4M3FNUZ>
        %159 = "affine.linearize_index"(%129#2, %30, %35, %130#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
        %160 = "affine.linearize_index"(%129#1, %12, %35, %130#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 4, 8>}> : (index, index, index, index, index) -> index
        %161 = "vector.transfer_read"(%43, %159, %160, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<1x8xf8E4M3FNUZ>
        %162 = "vector.insert_strided_slice"(%161, %158) <{offsets = [1, 3, 0, 0, 0, 0], strides = [1, 1]}> : (vector<1x8xf8E4M3FNUZ>, vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<2x4x1x1x1x8xf8E4M3FNUZ>
        %163 = "vector.transfer_read"(%89, %21) <{in_bounds = [], operandSegmentSizes = array<i32: 1, 0, 1, 0>, permutation_map = affine_map<() -> ()>}> : (memref<i64, #hal.descriptor_type<storage_buffer>>, i64) -> vector<i64>
        %164 = "iree_vector_ext.to_simd"(%163) : (vector<i64>) -> vector<i64>
        %165 = "vector.broadcast"(%164) : (vector<i64>) -> vector<32x32xi64>
        %166 = "vector.step"() : () -> vector<32xindex>
        %167 = "vector.broadcast"(%118) : (f32) -> vector<2x2x1x1x4x1xf32>
        %168:3 = "scf.for"(%35, %97, %30, %26, %25, %27) ({
        ^bb0(%arg3: index, %arg4: vector<2x1x4xf32>, %arg5: vector<2x1x4xf32>, %arg6: vector<2x8x1x1x4x1xf32>):
          "gpu.barrier"() : () -> ()
          %325 = "memref.subview"(%100, %arg3, %arg0) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0, -9223372036854775808, 0>, static_sizes = array<i64: 1, 32, 1, 128>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xf8E4M3FNUZ, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> memref<1x32x1x128xf8E4M3FNUZ, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %326 = "memref.subview"(%325) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0, 0, 0>, static_sizes = array<i64: 1, 32, 1, 128>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<1x32x1x128xf8E4M3FNUZ, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> memref<32x128xf8E4M3FNUZ, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %327:4 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 1, 1, 64>}> : (index) -> (index, index, index, index)
          %328:3 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 8, 8>}> : (index) -> (index, index, index)
          %329 = "affine.linearize_index"(%327#2, %35, %35, %328#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
          %330 = "affine.linearize_index"(%327#1, %35, %35, %328#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
          %331 = "vector.transfer_read"(%98, %arg0, %arg1, %arg3, %329, %330, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d3, d4)>}> : (memref<8x4x?x32x128xf8E4M3FNUZ, strided<[?, ?, 4096, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<1x16xf8E4M3FNUZ>
          %332 = "affine.linearize_index"(%327#2, %30, %35, %328#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
          %333 = "affine.linearize_index"(%327#1, %35, %35, %328#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
          %334 = "vector.transfer_read"(%98, %arg0, %arg1, %arg3, %332, %333, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d3, d4)>}> : (memref<8x4x?x32x128xf8E4M3FNUZ, strided<[?, ?, 4096, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<1x16xf8E4M3FNUZ>
          %335 = "affine.linearize_index"(%327#2, %13, %35, %328#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
          %336 = "affine.linearize_index"(%327#1, %35, %35, %328#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
          %337 = "vector.transfer_read"(%98, %arg0, %arg1, %arg3, %335, %336, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d3, d4)>}> : (memref<8x4x?x32x128xf8E4M3FNUZ, strided<[?, ?, 4096, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<1x16xf8E4M3FNUZ>
          %338 = "affine.linearize_index"(%327#2, %12, %35, %328#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
          %339 = "affine.linearize_index"(%327#1, %35, %35, %328#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
          %340 = "vector.transfer_read"(%98, %arg0, %arg1, %arg3, %338, %339, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d3, d4)>}> : (memref<8x4x?x32x128xf8E4M3FNUZ, strided<[?, ?, 4096, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<1x16xf8E4M3FNUZ>
          %341:4 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 1, 1, 64>}> : (index) -> (index, index, index, index)
          %342:3 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 8, 8>}> : (index) -> (index, index, index)
          %343 = "affine.linearize_index"(%341#2, %35, %35, %342#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
          %344 = "affine.linearize_index"(%341#1, %35, %35, %342#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
          %345 = "vector.transfer_read"(%326, %343, %344, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, f8E4M3FNUZ) -> vector<1x16xf8E4M3FNUZ>
          %346 = "affine.linearize_index"(%341#2, %30, %35, %342#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
          %347 = "affine.linearize_index"(%341#1, %35, %35, %342#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
          %348 = "vector.transfer_read"(%326, %346, %347, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, f8E4M3FNUZ) -> vector<1x16xf8E4M3FNUZ>
          %349 = "affine.linearize_index"(%341#2, %13, %35, %342#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
          %350 = "affine.linearize_index"(%341#1, %35, %35, %342#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
          %351 = "vector.transfer_read"(%326, %349, %350, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, f8E4M3FNUZ) -> vector<1x16xf8E4M3FNUZ>
          %352 = "affine.linearize_index"(%341#2, %12, %35, %342#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
          %353 = "affine.linearize_index"(%341#1, %35, %35, %342#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
          %354 = "vector.transfer_read"(%326, %352, %353, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, f8E4M3FNUZ) -> vector<1x16xf8E4M3FNUZ>
          %355 = "affine.linearize_index"(%arg3, %35, %99) <{disjoint, operandSegmentSizes = array<i32: 2, 1>, static_basis = array<i64: -9223372036854775808, 32>}> : (index, index, index) -> index
          %356 = "vector.transfer_read"(%95, %arg2, %35, %355, %22) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 3, 1, 0>, permutation_map = affine_map<(d0, d1, d2) -> (d1, d2)>}> : (memref<?x32x?xi8, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, i8) -> vector<32x32xi8>
          %357 = "arith.trunci"(%356) : (vector<32x32xi8>) -> vector<32x32xi1>
          %358 = "vector.broadcast"(%355) : (index) -> vector<32xindex>
          %359 = "arith.addi"(%358, %166) <{overflowFlags = #arith.overflow<none>}> : (vector<32xindex>, vector<32xindex>) -> vector<32xindex>
          %360 = "arith.index_cast"(%359) : (vector<32xindex>) -> vector<32xi64>
          %361 = "vector.broadcast"(%360) : (vector<32xi64>) -> vector<32x32xi64>
          %362 = "arith.cmpi"(%361, %165) <{predicate = 5 : i64}> : (vector<32x32xi64>, vector<32x32xi64>) -> vector<32x32xi1>
          %363 = "arith.ori"(%357, %362) : (vector<32x32xi1>, vector<32x32xi1>) -> vector<32x32xi1>
          %364 = "arith.select"(%363, %20, %23) : (vector<32x32xi1>, vector<32x32xf32>, vector<32x32xf32>) -> vector<32x32xf32>
          %365 = "arith.truncf"(%364) : (vector<32x32xf32>) -> vector<32x32xf8E4M3FNUZ>
          "vector.transfer_write"(%365, %44, %35, %35, %35) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 3, 0>, permutation_map = affine_map<(d0, d1, d2) -> (d1, d2)>}> : (vector<32x32xf8E4M3FNUZ>, memref<1x32x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, index) -> ()
          %366 = "memref.expand_shape"(%44) <{reassociation = [[0, 1], [2], [3, 4]], static_output_shape = array<i64: 1, 1, 32, 1, 32>}> : (memref<1x32x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>) -> memref<1x1x32x1x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>
          %367 = "memref.subview"(%366) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0, 0, 0, 0>, static_sizes = array<i64: 1, 1, 32, 1, 32>, static_strides = array<i64: 1, 1, 1, 1, 1>}> : (memref<1x1x32x1x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>) -> memref<32x32xf8E4M3FNUZ, strided<[32, 1]>, #gpu.address_space<workgroup>>
          %368:4 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 1, 1, 64>}> : (index) -> (index, index, index, index)
          %369:3 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 8, 8>}> : (index) -> (index, index, index)
          %370 = "affine.linearize_index"(%368#2, %35, %35, %369#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
          %371 = "affine.linearize_index"(%368#1, %35, %35, %369#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
          "vector.transfer_write"(%331, %42, %370, %371) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<1x16xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index) -> ()
          %372 = "affine.linearize_index"(%368#2, %30, %35, %369#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
          %373 = "affine.linearize_index"(%368#1, %35, %35, %369#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
          "vector.transfer_write"(%334, %42, %372, %373) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<1x16xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index) -> ()
          %374 = "affine.linearize_index"(%368#2, %13, %35, %369#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
          %375 = "affine.linearize_index"(%368#1, %35, %35, %369#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
          "vector.transfer_write"(%337, %42, %374, %375) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<1x16xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index) -> ()
          %376 = "affine.linearize_index"(%368#2, %12, %35, %369#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
          %377 = "affine.linearize_index"(%368#1, %35, %35, %369#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
          "vector.transfer_write"(%340, %42, %376, %377) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<1x16xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index) -> ()
          %378:4 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 1, 1, 64>}> : (index) -> (index, index, index, index)
          %379:3 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 8, 8>}> : (index) -> (index, index, index)
          %380 = "affine.linearize_index"(%378#2, %35, %35, %379#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
          %381 = "affine.linearize_index"(%378#1, %35, %35, %379#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
          "vector.transfer_write"(%345, %41, %380, %381) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<1x16xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index) -> ()
          %382 = "affine.linearize_index"(%378#2, %30, %35, %379#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
          %383 = "affine.linearize_index"(%378#1, %35, %35, %379#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
          "vector.transfer_write"(%348, %41, %382, %383) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<1x16xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index) -> ()
          %384 = "affine.linearize_index"(%378#2, %13, %35, %379#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
          %385 = "affine.linearize_index"(%378#1, %35, %35, %379#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
          "vector.transfer_write"(%351, %41, %384, %385) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<1x16xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index) -> ()
          %386 = "affine.linearize_index"(%378#2, %12, %35, %379#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 8, 1>}> : (index, index, index, index, index) -> index
          %387 = "affine.linearize_index"(%378#1, %35, %35, %379#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 8, 16>}> : (index, index, index, index, index) -> index
          "vector.transfer_write"(%354, %41, %386, %387) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<1x16xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index) -> ()
          %388:4 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 1, 1, 64>}> : (index) -> (index, index, index, index)
          %389:3 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index, index)
          %390 = "affine.linearize_index"(%388#2, %35, %35, %389#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
          %391 = "affine.linearize_index"(%388#1, %35, %35, %389#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
          %392 = "vector.transfer_read"(%367, %390, %391, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x32xf8E4M3FNUZ, strided<[32, 1]>, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<4x1xf8E4M3FNUZ>
          %393 = "vector.insert_strided_slice"(%392, %10) <{offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]}> : (vector<4x1xf8E4M3FNUZ>, vector<2x2x1x1x4x1xf8E4M3FNUZ>) -> vector<2x2x1x1x4x1xf8E4M3FNUZ>
          %394 = "affine.linearize_index"(%388#2, %35, %35, %389#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
          %395 = "affine.linearize_index"(%388#1, %30, %35, %389#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
          %396 = "vector.transfer_read"(%367, %394, %395, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x32xf8E4M3FNUZ, strided<[32, 1]>, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<4x1xf8E4M3FNUZ>
          %397 = "vector.insert_strided_slice"(%396, %393) <{offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]}> : (vector<4x1xf8E4M3FNUZ>, vector<2x2x1x1x4x1xf8E4M3FNUZ>) -> vector<2x2x1x1x4x1xf8E4M3FNUZ>
          %398 = "affine.linearize_index"(%388#2, %30, %35, %389#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
          %399 = "affine.linearize_index"(%388#1, %35, %35, %389#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
          %400 = "vector.transfer_read"(%367, %398, %399, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x32xf8E4M3FNUZ, strided<[32, 1]>, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<4x1xf8E4M3FNUZ>
          %401 = "vector.insert_strided_slice"(%400, %397) <{offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]}> : (vector<4x1xf8E4M3FNUZ>, vector<2x2x1x1x4x1xf8E4M3FNUZ>) -> vector<2x2x1x1x4x1xf8E4M3FNUZ>
          %402 = "affine.linearize_index"(%388#2, %30, %35, %389#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
          %403 = "affine.linearize_index"(%388#1, %30, %35, %389#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
          %404 = "vector.transfer_read"(%367, %402, %403, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x32xf8E4M3FNUZ, strided<[32, 1]>, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<4x1xf8E4M3FNUZ>
          %405 = "vector.insert_strided_slice"(%404, %401) <{offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]}> : (vector<4x1xf8E4M3FNUZ>, vector<2x2x1x1x4x1xf8E4M3FNUZ>) -> vector<2x2x1x1x4x1xf8E4M3FNUZ>
          %406 = "arith.extf"(%405) : (vector<2x2x1x1x4x1xf8E4M3FNUZ>) -> vector<2x2x1x1x4x1xf32>
          %407 = "arith.mulf"(%406, %18) <{fastmath = #arith.fastmath<none>}> : (vector<2x2x1x1x4x1xf32>, vector<2x2x1x1x4x1xf32>) -> vector<2x2x1x1x4x1xf32>
          "gpu.barrier"() : () -> ()
          %408:4 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 1, 1, 64>}> : (index) -> (index, index, index, index)
          %409:3 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index, index)
          %410 = "affine.linearize_index"(%408#2, %35, %35, %409#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
          %411 = "affine.linearize_index"(%408#1, %35, %35, %409#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 4, 8>}> : (index, index, index, index, index) -> index
          %412 = "vector.transfer_read"(%42, %410, %411, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<1x8xf8E4M3FNUZ>
          %413 = "vector.insert_strided_slice"(%412, %11) <{offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]}> : (vector<1x8xf8E4M3FNUZ>, vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<2x4x1x1x1x8xf8E4M3FNUZ>
          %414 = "affine.linearize_index"(%408#2, %35, %35, %409#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
          %415 = "affine.linearize_index"(%408#1, %30, %35, %409#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 4, 8>}> : (index, index, index, index, index) -> index
          %416 = "vector.transfer_read"(%42, %414, %415, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<1x8xf8E4M3FNUZ>
          %417 = "vector.insert_strided_slice"(%416, %413) <{offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]}> : (vector<1x8xf8E4M3FNUZ>, vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<2x4x1x1x1x8xf8E4M3FNUZ>
          %418 = "affine.linearize_index"(%408#2, %35, %35, %409#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
          %419 = "affine.linearize_index"(%408#1, %13, %35, %409#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 4, 8>}> : (index, index, index, index, index) -> index
          %420 = "vector.transfer_read"(%42, %418, %419, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<1x8xf8E4M3FNUZ>
          %421 = "vector.insert_strided_slice"(%420, %417) <{offsets = [0, 2, 0, 0, 0, 0], strides = [1, 1]}> : (vector<1x8xf8E4M3FNUZ>, vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<2x4x1x1x1x8xf8E4M3FNUZ>
          %422 = "affine.linearize_index"(%408#2, %35, %35, %409#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
          %423 = "affine.linearize_index"(%408#1, %12, %35, %409#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 4, 8>}> : (index, index, index, index, index) -> index
          %424 = "vector.transfer_read"(%42, %422, %423, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<1x8xf8E4M3FNUZ>
          %425 = "vector.insert_strided_slice"(%424, %421) <{offsets = [0, 3, 0, 0, 0, 0], strides = [1, 1]}> : (vector<1x8xf8E4M3FNUZ>, vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<2x4x1x1x1x8xf8E4M3FNUZ>
          %426 = "affine.linearize_index"(%408#2, %30, %35, %409#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
          %427 = "affine.linearize_index"(%408#1, %35, %35, %409#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 4, 8>}> : (index, index, index, index, index) -> index
          %428 = "vector.transfer_read"(%42, %426, %427, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<1x8xf8E4M3FNUZ>
          %429 = "vector.insert_strided_slice"(%428, %425) <{offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]}> : (vector<1x8xf8E4M3FNUZ>, vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<2x4x1x1x1x8xf8E4M3FNUZ>
          %430 = "affine.linearize_index"(%408#2, %30, %35, %409#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
          %431 = "affine.linearize_index"(%408#1, %30, %35, %409#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 4, 8>}> : (index, index, index, index, index) -> index
          %432 = "vector.transfer_read"(%42, %430, %431, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<1x8xf8E4M3FNUZ>
          %433 = "vector.insert_strided_slice"(%432, %429) <{offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]}> : (vector<1x8xf8E4M3FNUZ>, vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<2x4x1x1x1x8xf8E4M3FNUZ>
          %434 = "affine.linearize_index"(%408#2, %30, %35, %409#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
          %435 = "affine.linearize_index"(%408#1, %13, %35, %409#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 4, 8>}> : (index, index, index, index, index) -> index
          %436 = "vector.transfer_read"(%42, %434, %435, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<1x8xf8E4M3FNUZ>
          %437 = "vector.insert_strided_slice"(%436, %433) <{offsets = [1, 2, 0, 0, 0, 0], strides = [1, 1]}> : (vector<1x8xf8E4M3FNUZ>, vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<2x4x1x1x1x8xf8E4M3FNUZ>
          %438 = "affine.linearize_index"(%408#2, %30, %35, %409#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
          %439 = "affine.linearize_index"(%408#1, %12, %35, %409#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 4, 1, 4, 8>}> : (index, index, index, index, index) -> index
          %440 = "vector.transfer_read"(%42, %438, %439, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<1x8xf8E4M3FNUZ>
          %441 = "vector.insert_strided_slice"(%440, %437) <{offsets = [1, 3, 0, 0, 0, 0], strides = [1, 1]}> : (vector<1x8xf8E4M3FNUZ>, vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<2x4x1x1x1x8xf8E4M3FNUZ>
          %442 = "vector.extract"(%24) <{static_position = array<i64: 0, 0>}> : (vector<2x2x1x1x4x1xf32>) -> vector<1x1x4x1xf32>
          %443 = "vector.extract"(%162) <{static_position = array<i64: 0, 0>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %444 = "vector.extract"(%441) <{static_position = array<i64: 0, 0>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %445 = "vector.shape_cast"(%443) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %446 = "vector.shape_cast"(%444) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %447 = "vector.shape_cast"(%442) : (vector<1x1x4x1xf32>) -> vector<4xf32>
          %448 = "amdgpu.mfma"(%445, %446, %447) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
          %449 = "vector.extract"(%162) <{static_position = array<i64: 0, 1>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %450 = "vector.extract"(%441) <{static_position = array<i64: 0, 1>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %451 = "vector.shape_cast"(%449) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %452 = "vector.shape_cast"(%450) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %453 = "amdgpu.mfma"(%451, %452, %448) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
          %454 = "vector.extract"(%162) <{static_position = array<i64: 0, 2>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %455 = "vector.extract"(%441) <{static_position = array<i64: 0, 2>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %456 = "vector.shape_cast"(%454) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %457 = "vector.shape_cast"(%455) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %458 = "amdgpu.mfma"(%456, %457, %453) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
          %459 = "vector.extract"(%162) <{static_position = array<i64: 0, 3>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %460 = "vector.extract"(%441) <{static_position = array<i64: 0, 3>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %461 = "vector.shape_cast"(%459) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %462 = "vector.shape_cast"(%460) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %463 = "amdgpu.mfma"(%461, %462, %458) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
          %464 = "vector.shape_cast"(%463) : (vector<4xf32>) -> vector<1x1x4x1xf32>
          %465 = "vector.insert"(%464, %24) <{static_position = array<i64: 0, 0>}> : (vector<1x1x4x1xf32>, vector<2x2x1x1x4x1xf32>) -> vector<2x2x1x1x4x1xf32>
          %466 = "vector.extract"(%24) <{static_position = array<i64: 0, 1>}> : (vector<2x2x1x1x4x1xf32>) -> vector<1x1x4x1xf32>
          %467 = "vector.extract"(%162) <{static_position = array<i64: 0, 0>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %468 = "vector.extract"(%441) <{static_position = array<i64: 1, 0>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %469 = "vector.shape_cast"(%467) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %470 = "vector.shape_cast"(%468) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %471 = "vector.shape_cast"(%466) : (vector<1x1x4x1xf32>) -> vector<4xf32>
          %472 = "amdgpu.mfma"(%469, %470, %471) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
          %473 = "vector.extract"(%162) <{static_position = array<i64: 0, 1>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %474 = "vector.extract"(%441) <{static_position = array<i64: 1, 1>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %475 = "vector.shape_cast"(%473) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %476 = "vector.shape_cast"(%474) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %477 = "amdgpu.mfma"(%475, %476, %472) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
          %478 = "vector.extract"(%162) <{static_position = array<i64: 0, 2>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %479 = "vector.extract"(%441) <{static_position = array<i64: 1, 2>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %480 = "vector.shape_cast"(%478) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %481 = "vector.shape_cast"(%479) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %482 = "amdgpu.mfma"(%480, %481, %477) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
          %483 = "vector.extract"(%162) <{static_position = array<i64: 0, 3>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %484 = "vector.extract"(%441) <{static_position = array<i64: 1, 3>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %485 = "vector.shape_cast"(%483) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %486 = "vector.shape_cast"(%484) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %487 = "amdgpu.mfma"(%485, %486, %482) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
          %488 = "vector.shape_cast"(%487) : (vector<4xf32>) -> vector<1x1x4x1xf32>
          %489 = "vector.insert"(%488, %465) <{static_position = array<i64: 0, 1>}> : (vector<1x1x4x1xf32>, vector<2x2x1x1x4x1xf32>) -> vector<2x2x1x1x4x1xf32>
          %490 = "vector.extract"(%24) <{static_position = array<i64: 1, 0>}> : (vector<2x2x1x1x4x1xf32>) -> vector<1x1x4x1xf32>
          %491 = "vector.extract"(%162) <{static_position = array<i64: 1, 0>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %492 = "vector.extract"(%441) <{static_position = array<i64: 0, 0>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %493 = "vector.shape_cast"(%491) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %494 = "vector.shape_cast"(%492) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %495 = "vector.shape_cast"(%490) : (vector<1x1x4x1xf32>) -> vector<4xf32>
          %496 = "amdgpu.mfma"(%493, %494, %495) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
          %497 = "vector.extract"(%162) <{static_position = array<i64: 1, 1>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %498 = "vector.extract"(%441) <{static_position = array<i64: 0, 1>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %499 = "vector.shape_cast"(%497) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %500 = "vector.shape_cast"(%498) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %501 = "amdgpu.mfma"(%499, %500, %496) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
          %502 = "vector.extract"(%162) <{static_position = array<i64: 1, 2>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %503 = "vector.extract"(%441) <{static_position = array<i64: 0, 2>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %504 = "vector.shape_cast"(%502) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %505 = "vector.shape_cast"(%503) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %506 = "amdgpu.mfma"(%504, %505, %501) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
          %507 = "vector.extract"(%162) <{static_position = array<i64: 1, 3>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %508 = "vector.extract"(%441) <{static_position = array<i64: 0, 3>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %509 = "vector.shape_cast"(%507) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %510 = "vector.shape_cast"(%508) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %511 = "amdgpu.mfma"(%509, %510, %506) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
          %512 = "vector.shape_cast"(%511) : (vector<4xf32>) -> vector<1x1x4x1xf32>
          %513 = "vector.insert"(%512, %489) <{static_position = array<i64: 1, 0>}> : (vector<1x1x4x1xf32>, vector<2x2x1x1x4x1xf32>) -> vector<2x2x1x1x4x1xf32>
          %514 = "vector.extract"(%24) <{static_position = array<i64: 1, 1>}> : (vector<2x2x1x1x4x1xf32>) -> vector<1x1x4x1xf32>
          %515 = "vector.extract"(%162) <{static_position = array<i64: 1, 0>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %516 = "vector.extract"(%441) <{static_position = array<i64: 1, 0>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %517 = "vector.shape_cast"(%515) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %518 = "vector.shape_cast"(%516) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %519 = "vector.shape_cast"(%514) : (vector<1x1x4x1xf32>) -> vector<4xf32>
          %520 = "amdgpu.mfma"(%517, %518, %519) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
          %521 = "vector.extract"(%162) <{static_position = array<i64: 1, 1>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %522 = "vector.extract"(%441) <{static_position = array<i64: 1, 1>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %523 = "vector.shape_cast"(%521) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %524 = "vector.shape_cast"(%522) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %525 = "amdgpu.mfma"(%523, %524, %520) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
          %526 = "vector.extract"(%162) <{static_position = array<i64: 1, 2>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %527 = "vector.extract"(%441) <{static_position = array<i64: 1, 2>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %528 = "vector.shape_cast"(%526) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %529 = "vector.shape_cast"(%527) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %530 = "amdgpu.mfma"(%528, %529, %525) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
          %531 = "vector.extract"(%162) <{static_position = array<i64: 1, 3>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %532 = "vector.extract"(%441) <{static_position = array<i64: 1, 3>}> : (vector<2x4x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %533 = "vector.shape_cast"(%531) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %534 = "vector.shape_cast"(%532) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %535 = "amdgpu.mfma"(%533, %534, %530) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
          %536 = "vector.shape_cast"(%535) : (vector<4xf32>) -> vector<1x1x4x1xf32>
          %537 = "vector.insert"(%536, %513) <{static_position = array<i64: 1, 1>}> : (vector<1x1x4x1xf32>, vector<2x2x1x1x4x1xf32>) -> vector<2x2x1x1x4x1xf32>
          %538 = "arith.mulf"(%167, %537) <{fastmath = #arith.fastmath<none>}> : (vector<2x2x1x1x4x1xf32>, vector<2x2x1x1x4x1xf32>) -> vector<2x2x1x1x4x1xf32>
          %539 = "arith.addf"(%538, %19) <{fastmath = #arith.fastmath<none>}> : (vector<2x2x1x1x4x1xf32>, vector<2x2x1x1x4x1xf32>) -> vector<2x2x1x1x4x1xf32>
          %540 = "arith.addf"(%539, %407) <{fastmath = #arith.fastmath<none>}> : (vector<2x2x1x1x4x1xf32>, vector<2x2x1x1x4x1xf32>) -> vector<2x2x1x1x4x1xf32>
          %541 = "vector.multi_reduction"(%540, %9) <{kind = #vector.kind<maximumf>, reduction_dims = array<i64: 1, 3, 5>}> : (vector<2x2x1x1x4x1xf32>, vector<2x1x4xf32>) -> vector<2x1x4xf32>
          %542 = "vector.extract"(%541) <{static_position = array<i64: 0, 0, 0>}> : (vector<2x1x4xf32>) -> f32
          %543 = "gpu.subgroup_reduce"(%542) <{cluster_size = 16 : i32, cluster_stride = 1 : i32, op = #gpu<all_reduce_op maximumf>}> : (f32) -> f32
          %544 = "vector.insert"(%543, %8) <{static_position = array<i64: 0>}> : (f32, vector<8xf32>) -> vector<8xf32>
          %545 = "vector.extract"(%541) <{static_position = array<i64: 0, 0, 1>}> : (vector<2x1x4xf32>) -> f32
          %546 = "gpu.subgroup_reduce"(%545) <{cluster_size = 16 : i32, cluster_stride = 1 : i32, op = #gpu<all_reduce_op maximumf>}> : (f32) -> f32
          %547 = "vector.insert"(%546, %544) <{static_position = array<i64: 1>}> : (f32, vector<8xf32>) -> vector<8xf32>
          %548 = "vector.extract"(%541) <{static_position = array<i64: 0, 0, 2>}> : (vector<2x1x4xf32>) -> f32
          %549 = "gpu.subgroup_reduce"(%548) <{cluster_size = 16 : i32, cluster_stride = 1 : i32, op = #gpu<all_reduce_op maximumf>}> : (f32) -> f32
          %550 = "vector.insert"(%549, %547) <{static_position = array<i64: 2>}> : (f32, vector<8xf32>) -> vector<8xf32>
          %551 = "vector.extract"(%541) <{static_position = array<i64: 0, 0, 3>}> : (vector<2x1x4xf32>) -> f32
          %552 = "gpu.subgroup_reduce"(%551) <{cluster_size = 16 : i32, cluster_stride = 1 : i32, op = #gpu<all_reduce_op maximumf>}> : (f32) -> f32
          %553 = "vector.insert"(%552, %550) <{static_position = array<i64: 3>}> : (f32, vector<8xf32>) -> vector<8xf32>
          %554 = "vector.extract"(%541) <{static_position = array<i64: 1, 0, 0>}> : (vector<2x1x4xf32>) -> f32
          %555 = "gpu.subgroup_reduce"(%554) <{cluster_size = 16 : i32, cluster_stride = 1 : i32, op = #gpu<all_reduce_op maximumf>}> : (f32) -> f32
          %556 = "vector.insert"(%555, %553) <{static_position = array<i64: 4>}> : (f32, vector<8xf32>) -> vector<8xf32>
          %557 = "vector.extract"(%541) <{static_position = array<i64: 1, 0, 1>}> : (vector<2x1x4xf32>) -> f32
          %558 = "gpu.subgroup_reduce"(%557) <{cluster_size = 16 : i32, cluster_stride = 1 : i32, op = #gpu<all_reduce_op maximumf>}> : (f32) -> f32
          %559 = "vector.insert"(%558, %556) <{static_position = array<i64: 5>}> : (f32, vector<8xf32>) -> vector<8xf32>
          %560 = "vector.extract"(%541) <{static_position = array<i64: 1, 0, 2>}> : (vector<2x1x4xf32>) -> f32
          %561 = "gpu.subgroup_reduce"(%560) <{cluster_size = 16 : i32, cluster_stride = 1 : i32, op = #gpu<all_reduce_op maximumf>}> : (f32) -> f32
          %562 = "vector.insert"(%561, %559) <{static_position = array<i64: 6>}> : (f32, vector<8xf32>) -> vector<8xf32>
          %563 = "vector.extract"(%541) <{static_position = array<i64: 1, 0, 3>}> : (vector<2x1x4xf32>) -> f32
          %564 = "gpu.subgroup_reduce"(%563) <{cluster_size = 16 : i32, cluster_stride = 1 : i32, op = #gpu<all_reduce_op maximumf>}> : (f32) -> f32
          %565 = "vector.insert"(%564, %562) <{static_position = array<i64: 7>}> : (f32, vector<8xf32>) -> vector<8xf32>
          %566 = "vector.shape_cast"(%565) : (vector<8xf32>) -> vector<2x1x4xf32>
          %567 = "arith.maximumf"(%566, %arg4) <{fastmath = #arith.fastmath<none>}> : (vector<2x1x4xf32>, vector<2x1x4xf32>) -> vector<2x1x4xf32>
          %568 = "arith.subf"(%arg4, %567) <{fastmath = #arith.fastmath<none>}> : (vector<2x1x4xf32>, vector<2x1x4xf32>) -> vector<2x1x4xf32>
          %569 = "math.exp2"(%568) <{fastmath = #arith.fastmath<none>}> : (vector<2x1x4xf32>) -> vector<2x1x4xf32>
          %570 = "arith.mulf"(%569, %arg5) <{fastmath = #arith.fastmath<none>}> : (vector<2x1x4xf32>, vector<2x1x4xf32>) -> vector<2x1x4xf32>
          %571 = "vector.extract"(%567) <{static_position = array<i64: 0, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
          %572 = "vector.broadcast"(%571) : (vector<4xf32>) -> vector<1x4xf32>
          %573 = "vector.insert"(%572, %7) <{static_position = array<i64: 0, 0, 0, 0>}> : (vector<1x4xf32>, vector<2x2x1x1x1x4xf32>) -> vector<2x2x1x1x1x4xf32>
          %574 = "vector.extract"(%567) <{static_position = array<i64: 1, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
          %575 = "vector.broadcast"(%574) : (vector<4xf32>) -> vector<1x4xf32>
          %576 = "vector.insert"(%575, %573) <{static_position = array<i64: 0, 1, 0, 0>}> : (vector<1x4xf32>, vector<2x2x1x1x1x4xf32>) -> vector<2x2x1x1x1x4xf32>
          %577 = "vector.extract"(%567) <{static_position = array<i64: 0, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
          %578 = "vector.broadcast"(%577) : (vector<4xf32>) -> vector<1x4xf32>
          %579 = "vector.insert"(%578, %576) <{static_position = array<i64: 1, 0, 0, 0>}> : (vector<1x4xf32>, vector<2x2x1x1x1x4xf32>) -> vector<2x2x1x1x1x4xf32>
          %580 = "vector.extract"(%567) <{static_position = array<i64: 1, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
          %581 = "vector.broadcast"(%580) : (vector<4xf32>) -> vector<1x4xf32>
          %582 = "vector.insert"(%581, %579) <{static_position = array<i64: 1, 1, 0, 0>}> : (vector<1x4xf32>, vector<2x2x1x1x1x4xf32>) -> vector<2x2x1x1x1x4xf32>
          %583 = "vector.transpose"(%582) <{permutation = array<i64: 1, 0, 3, 2, 5, 4>}> : (vector<2x2x1x1x1x4xf32>) -> vector<2x2x1x1x4x1xf32>
          %584 = "arith.subf"(%540, %583) <{fastmath = #arith.fastmath<none>}> : (vector<2x2x1x1x4x1xf32>, vector<2x2x1x1x4x1xf32>) -> vector<2x2x1x1x4x1xf32>
          %585 = "math.exp2"(%584) <{fastmath = #arith.fastmath<none>}> : (vector<2x2x1x1x4x1xf32>) -> vector<2x2x1x1x4x1xf32>
          %586 = "vector.multi_reduction"(%585, %25) <{kind = #vector.kind<add>, reduction_dims = array<i64: 1, 3, 5>}> : (vector<2x2x1x1x4x1xf32>, vector<2x1x4xf32>) -> vector<2x1x4xf32>
          %587 = "vector.extract"(%586) <{static_position = array<i64: 0, 0, 0>}> : (vector<2x1x4xf32>) -> f32
          %588 = "gpu.subgroup_reduce"(%587) <{cluster_size = 16 : i32, cluster_stride = 1 : i32, op = #gpu<all_reduce_op add>}> : (f32) -> f32
          %589 = "vector.insert"(%588, %8) <{static_position = array<i64: 0>}> : (f32, vector<8xf32>) -> vector<8xf32>
          %590 = "vector.extract"(%586) <{static_position = array<i64: 0, 0, 1>}> : (vector<2x1x4xf32>) -> f32
          %591 = "gpu.subgroup_reduce"(%590) <{cluster_size = 16 : i32, cluster_stride = 1 : i32, op = #gpu<all_reduce_op add>}> : (f32) -> f32
          %592 = "vector.insert"(%591, %589) <{static_position = array<i64: 1>}> : (f32, vector<8xf32>) -> vector<8xf32>
          %593 = "vector.extract"(%586) <{static_position = array<i64: 0, 0, 2>}> : (vector<2x1x4xf32>) -> f32
          %594 = "gpu.subgroup_reduce"(%593) <{cluster_size = 16 : i32, cluster_stride = 1 : i32, op = #gpu<all_reduce_op add>}> : (f32) -> f32
          %595 = "vector.insert"(%594, %592) <{static_position = array<i64: 2>}> : (f32, vector<8xf32>) -> vector<8xf32>
          %596 = "vector.extract"(%586) <{static_position = array<i64: 0, 0, 3>}> : (vector<2x1x4xf32>) -> f32
          %597 = "gpu.subgroup_reduce"(%596) <{cluster_size = 16 : i32, cluster_stride = 1 : i32, op = #gpu<all_reduce_op add>}> : (f32) -> f32
          %598 = "vector.insert"(%597, %595) <{static_position = array<i64: 3>}> : (f32, vector<8xf32>) -> vector<8xf32>
          %599 = "vector.extract"(%586) <{static_position = array<i64: 1, 0, 0>}> : (vector<2x1x4xf32>) -> f32
          %600 = "gpu.subgroup_reduce"(%599) <{cluster_size = 16 : i32, cluster_stride = 1 : i32, op = #gpu<all_reduce_op add>}> : (f32) -> f32
          %601 = "vector.insert"(%600, %598) <{static_position = array<i64: 4>}> : (f32, vector<8xf32>) -> vector<8xf32>
          %602 = "vector.extract"(%586) <{static_position = array<i64: 1, 0, 1>}> : (vector<2x1x4xf32>) -> f32
          %603 = "gpu.subgroup_reduce"(%602) <{cluster_size = 16 : i32, cluster_stride = 1 : i32, op = #gpu<all_reduce_op add>}> : (f32) -> f32
          %604 = "vector.insert"(%603, %601) <{static_position = array<i64: 5>}> : (f32, vector<8xf32>) -> vector<8xf32>
          %605 = "vector.extract"(%586) <{static_position = array<i64: 1, 0, 2>}> : (vector<2x1x4xf32>) -> f32
          %606 = "gpu.subgroup_reduce"(%605) <{cluster_size = 16 : i32, cluster_stride = 1 : i32, op = #gpu<all_reduce_op add>}> : (f32) -> f32
          %607 = "vector.insert"(%606, %604) <{static_position = array<i64: 6>}> : (f32, vector<8xf32>) -> vector<8xf32>
          %608 = "vector.extract"(%586) <{static_position = array<i64: 1, 0, 3>}> : (vector<2x1x4xf32>) -> f32
          %609 = "gpu.subgroup_reduce"(%608) <{cluster_size = 16 : i32, cluster_stride = 1 : i32, op = #gpu<all_reduce_op add>}> : (f32) -> f32
          %610 = "vector.insert"(%609, %607) <{static_position = array<i64: 7>}> : (f32, vector<8xf32>) -> vector<8xf32>
          %611 = "vector.shape_cast"(%610) : (vector<8xf32>) -> vector<2x1x4xf32>
          %612 = "arith.addf"(%611, %570) <{fastmath = #arith.fastmath<none>}> : (vector<2x1x4xf32>, vector<2x1x4xf32>) -> vector<2x1x4xf32>
          %613 = "arith.minimumf"(%585, %17) <{fastmath = #arith.fastmath<none>}> : (vector<2x2x1x1x4x1xf32>, vector<2x2x1x1x4x1xf32>) -> vector<2x2x1x1x4x1xf32>
          %614 = "arith.truncf"(%613) : (vector<2x2x1x1x4x1xf32>) -> vector<2x2x1x1x4x1xf8E4M3FNUZ>
          %615 = "vector.extract"(%569) <{static_position = array<i64: 0, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
          %616 = "vector.broadcast"(%615) : (vector<4xf32>) -> vector<1x4xf32>
          %617 = "vector.insert"(%616, %6) <{static_position = array<i64: 0, 0, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
          %618 = "vector.extract"(%569) <{static_position = array<i64: 1, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
          %619 = "vector.broadcast"(%618) : (vector<4xf32>) -> vector<1x4xf32>
          %620 = "vector.insert"(%619, %617) <{static_position = array<i64: 0, 1, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
          %621 = "vector.extract"(%569) <{static_position = array<i64: 0, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
          %622 = "vector.broadcast"(%621) : (vector<4xf32>) -> vector<1x4xf32>
          %623 = "vector.insert"(%622, %620) <{static_position = array<i64: 1, 0, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
          %624 = "vector.extract"(%569) <{static_position = array<i64: 1, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
          %625 = "vector.broadcast"(%624) : (vector<4xf32>) -> vector<1x4xf32>
          %626 = "vector.insert"(%625, %623) <{static_position = array<i64: 1, 1, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
          %627 = "vector.extract"(%569) <{static_position = array<i64: 0, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
          %628 = "vector.broadcast"(%627) : (vector<4xf32>) -> vector<1x4xf32>
          %629 = "vector.insert"(%628, %626) <{static_position = array<i64: 2, 0, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
          %630 = "vector.extract"(%569) <{static_position = array<i64: 1, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
          %631 = "vector.broadcast"(%630) : (vector<4xf32>) -> vector<1x4xf32>
          %632 = "vector.insert"(%631, %629) <{static_position = array<i64: 2, 1, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
          %633 = "vector.extract"(%569) <{static_position = array<i64: 0, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
          %634 = "vector.broadcast"(%633) : (vector<4xf32>) -> vector<1x4xf32>
          %635 = "vector.insert"(%634, %632) <{static_position = array<i64: 3, 0, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
          %636 = "vector.extract"(%569) <{static_position = array<i64: 1, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
          %637 = "vector.broadcast"(%636) : (vector<4xf32>) -> vector<1x4xf32>
          %638 = "vector.insert"(%637, %635) <{static_position = array<i64: 3, 1, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
          %639 = "vector.extract"(%569) <{static_position = array<i64: 0, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
          %640 = "vector.broadcast"(%639) : (vector<4xf32>) -> vector<1x4xf32>
          %641 = "vector.insert"(%640, %638) <{static_position = array<i64: 4, 0, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
          %642 = "vector.extract"(%569) <{static_position = array<i64: 1, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
          %643 = "vector.broadcast"(%642) : (vector<4xf32>) -> vector<1x4xf32>
          %644 = "vector.insert"(%643, %641) <{static_position = array<i64: 4, 1, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
          %645 = "vector.extract"(%569) <{static_position = array<i64: 0, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
          %646 = "vector.broadcast"(%645) : (vector<4xf32>) -> vector<1x4xf32>
          %647 = "vector.insert"(%646, %644) <{static_position = array<i64: 5, 0, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
          %648 = "vector.extract"(%569) <{static_position = array<i64: 1, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
          %649 = "vector.broadcast"(%648) : (vector<4xf32>) -> vector<1x4xf32>
          %650 = "vector.insert"(%649, %647) <{static_position = array<i64: 5, 1, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
          %651 = "vector.extract"(%569) <{static_position = array<i64: 0, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
          %652 = "vector.broadcast"(%651) : (vector<4xf32>) -> vector<1x4xf32>
          %653 = "vector.insert"(%652, %650) <{static_position = array<i64: 6, 0, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
          %654 = "vector.extract"(%569) <{static_position = array<i64: 1, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
          %655 = "vector.broadcast"(%654) : (vector<4xf32>) -> vector<1x4xf32>
          %656 = "vector.insert"(%655, %653) <{static_position = array<i64: 6, 1, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
          %657 = "vector.extract"(%569) <{static_position = array<i64: 0, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
          %658 = "vector.broadcast"(%657) : (vector<4xf32>) -> vector<1x4xf32>
          %659 = "vector.insert"(%658, %656) <{static_position = array<i64: 7, 0, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
          %660 = "vector.extract"(%569) <{static_position = array<i64: 1, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
          %661 = "vector.broadcast"(%660) : (vector<4xf32>) -> vector<1x4xf32>
          %662 = "vector.insert"(%661, %659) <{static_position = array<i64: 7, 1, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
          %663 = "vector.transpose"(%662) <{permutation = array<i64: 1, 0, 3, 2, 5, 4>}> : (vector<8x2x1x1x1x4xf32>) -> vector<2x8x1x1x4x1xf32>
          %664 = "arith.mulf"(%663, %arg6) <{fastmath = #arith.fastmath<none>}> : (vector<2x8x1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
          %665:4 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 1, 1, 64>}> : (index) -> (index, index, index, index)
          %666:3 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index, index)
          %667 = "affine.linearize_index"(%665#2, %35, %35, %666#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
          %668 = "affine.linearize_index"(%665#1, %35, %35, %666#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
          %669 = "vector.extract"(%614) <{static_position = array<i64: 0, 0, 0, 0>}> : (vector<2x2x1x1x4x1xf8E4M3FNUZ>) -> vector<4x1xf8E4M3FNUZ>
          "vector.transfer_write"(%669, %40, %667, %668) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<4x1xf8E4M3FNUZ>, memref<32x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index) -> ()
          %670 = "affine.linearize_index"(%665#2, %35, %35, %666#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
          %671 = "affine.linearize_index"(%665#1, %30, %35, %666#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
          %672 = "vector.extract"(%614) <{static_position = array<i64: 0, 1, 0, 0>}> : (vector<2x2x1x1x4x1xf8E4M3FNUZ>) -> vector<4x1xf8E4M3FNUZ>
          "vector.transfer_write"(%672, %40, %670, %671) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<4x1xf8E4M3FNUZ>, memref<32x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index) -> ()
          %673 = "affine.linearize_index"(%665#2, %30, %35, %666#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
          %674 = "affine.linearize_index"(%665#1, %35, %35, %666#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
          %675 = "vector.extract"(%614) <{static_position = array<i64: 1, 0, 0, 0>}> : (vector<2x2x1x1x4x1xf8E4M3FNUZ>) -> vector<4x1xf8E4M3FNUZ>
          "vector.transfer_write"(%675, %40, %673, %674) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<4x1xf8E4M3FNUZ>, memref<32x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index) -> ()
          %676 = "affine.linearize_index"(%665#2, %30, %35, %666#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
          %677 = "affine.linearize_index"(%665#1, %30, %35, %666#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
          %678 = "vector.extract"(%614) <{static_position = array<i64: 1, 1, 0, 0>}> : (vector<2x2x1x1x4x1xf8E4M3FNUZ>) -> vector<4x1xf8E4M3FNUZ>
          "vector.transfer_write"(%678, %40, %676, %677) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<4x1xf8E4M3FNUZ>, memref<32x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index) -> ()
          "gpu.barrier"() : () -> ()
          %679:4 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 1, 1, 64>}> : (index) -> (index, index, index, index)
          %680:3 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index, index)
          %681 = "affine.linearize_index"(%679#2, %35, %35, %680#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
          %682 = "affine.linearize_index"(%679#1, %35, %35, %680#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 4, 8>}> : (index, index, index, index, index) -> index
          %683 = "vector.transfer_read"(%40, %681, %682, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<1x8xf8E4M3FNUZ>
          %684 = "vector.insert_strided_slice"(%683, %5) <{offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]}> : (vector<1x8xf8E4M3FNUZ>, vector<2x1x1x1x1x8xf8E4M3FNUZ>) -> vector<2x1x1x1x1x8xf8E4M3FNUZ>
          %685 = "affine.linearize_index"(%679#2, %30, %35, %680#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 16, 1>}> : (index, index, index, index, index) -> index
          %686 = "affine.linearize_index"(%679#1, %35, %35, %680#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 4, 8>}> : (index, index, index, index, index) -> index
          %687 = "vector.transfer_read"(%40, %685, %686, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<1x8xf8E4M3FNUZ>
          %688 = "vector.insert_strided_slice"(%687, %684) <{offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]}> : (vector<1x8xf8E4M3FNUZ>, vector<2x1x1x1x1x8xf8E4M3FNUZ>) -> vector<2x1x1x1x1x8xf8E4M3FNUZ>
          %689:4 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 1, 1, 64>}> : (index) -> (index, index, index, index)
          %690:3 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index, index)
          %691 = "affine.linearize_index"(%689#2, %35, %35, %690#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 4, 8>}> : (index, index, index, index, index) -> index
          %692 = "affine.linearize_index"(%689#1, %35, %35, %690#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
          %693 = "vector.transfer_read"(%41, %691, %692, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<8x1xf8E4M3FNUZ>
          %694 = "vector.insert_strided_slice"(%693, %4) <{offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]}> : (vector<8x1xf8E4M3FNUZ>, vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x8x1x1x8x1xf8E4M3FNUZ>
          %695 = "affine.linearize_index"(%689#2, %35, %35, %690#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 4, 8>}> : (index, index, index, index, index) -> index
          %696 = "affine.linearize_index"(%689#1, %30, %35, %690#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
          %697 = "vector.transfer_read"(%41, %695, %696, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<8x1xf8E4M3FNUZ>
          %698 = "vector.insert_strided_slice"(%697, %694) <{offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]}> : (vector<8x1xf8E4M3FNUZ>, vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x8x1x1x8x1xf8E4M3FNUZ>
          %699 = "affine.linearize_index"(%689#2, %35, %35, %690#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 4, 8>}> : (index, index, index, index, index) -> index
          %700 = "affine.linearize_index"(%689#1, %13, %35, %690#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
          %701 = "vector.transfer_read"(%41, %699, %700, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<8x1xf8E4M3FNUZ>
          %702 = "vector.insert_strided_slice"(%701, %698) <{offsets = [0, 2, 0, 0, 0, 0], strides = [1, 1]}> : (vector<8x1xf8E4M3FNUZ>, vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x8x1x1x8x1xf8E4M3FNUZ>
          %703 = "affine.linearize_index"(%689#2, %35, %35, %690#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 4, 8>}> : (index, index, index, index, index) -> index
          %704 = "affine.linearize_index"(%689#1, %12, %35, %690#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
          %705 = "vector.transfer_read"(%41, %703, %704, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<8x1xf8E4M3FNUZ>
          %706 = "vector.insert_strided_slice"(%705, %702) <{offsets = [0, 3, 0, 0, 0, 0], strides = [1, 1]}> : (vector<8x1xf8E4M3FNUZ>, vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x8x1x1x8x1xf8E4M3FNUZ>
          %707 = "affine.linearize_index"(%689#2, %35, %35, %690#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 4, 8>}> : (index, index, index, index, index) -> index
          %708 = "affine.linearize_index"(%689#1, %3, %35, %690#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
          %709 = "vector.transfer_read"(%41, %707, %708, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<8x1xf8E4M3FNUZ>
          %710 = "vector.insert_strided_slice"(%709, %706) <{offsets = [0, 4, 0, 0, 0, 0], strides = [1, 1]}> : (vector<8x1xf8E4M3FNUZ>, vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x8x1x1x8x1xf8E4M3FNUZ>
          %711 = "affine.linearize_index"(%689#2, %35, %35, %690#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 4, 8>}> : (index, index, index, index, index) -> index
          %712 = "affine.linearize_index"(%689#1, %2, %35, %690#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
          %713 = "vector.transfer_read"(%41, %711, %712, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<8x1xf8E4M3FNUZ>
          %714 = "vector.insert_strided_slice"(%713, %710) <{offsets = [0, 5, 0, 0, 0, 0], strides = [1, 1]}> : (vector<8x1xf8E4M3FNUZ>, vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x8x1x1x8x1xf8E4M3FNUZ>
          %715 = "affine.linearize_index"(%689#2, %35, %35, %690#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 4, 8>}> : (index, index, index, index, index) -> index
          %716 = "affine.linearize_index"(%689#1, %1, %35, %690#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
          %717 = "vector.transfer_read"(%41, %715, %716, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<8x1xf8E4M3FNUZ>
          %718 = "vector.insert_strided_slice"(%717, %714) <{offsets = [0, 6, 0, 0, 0, 0], strides = [1, 1]}> : (vector<8x1xf8E4M3FNUZ>, vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x8x1x1x8x1xf8E4M3FNUZ>
          %719 = "affine.linearize_index"(%689#2, %35, %35, %690#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 1, 1, 4, 8>}> : (index, index, index, index, index) -> index
          %720 = "affine.linearize_index"(%689#1, %0, %35, %690#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
          %721 = "vector.transfer_read"(%41, %719, %720, %28) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>, index, index, f8E4M3FNUZ) -> vector<8x1xf8E4M3FNUZ>
          %722 = "vector.insert_strided_slice"(%721, %718) <{offsets = [0, 7, 0, 0, 0, 0], strides = [1, 1]}> : (vector<8x1xf8E4M3FNUZ>, vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x8x1x1x8x1xf8E4M3FNUZ>
          %723 = "vector.extract"(%664) <{static_position = array<i64: 0, 0>}> : (vector<2x8x1x1x4x1xf32>) -> vector<1x1x4x1xf32>
          %724 = "vector.extract"(%688) <{static_position = array<i64: 0, 0>}> : (vector<2x1x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %725 = "vector.extract"(%722) <{static_position = array<i64: 0, 0>}> : (vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x1x8x1xf8E4M3FNUZ>
          %726 = "vector.shape_cast"(%724) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %727 = "vector.shape_cast"(%725) : (vector<1x1x8x1xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %728 = "vector.shape_cast"(%723) : (vector<1x1x4x1xf32>) -> vector<4xf32>
          %729 = "amdgpu.mfma"(%726, %727, %728) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
          %730 = "vector.shape_cast"(%729) : (vector<4xf32>) -> vector<1x1x4x1xf32>
          %731 = "vector.insert"(%730, %27) <{static_position = array<i64: 0, 0>}> : (vector<1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
          %732 = "vector.extract"(%664) <{static_position = array<i64: 0, 1>}> : (vector<2x8x1x1x4x1xf32>) -> vector<1x1x4x1xf32>
          %733 = "vector.extract"(%688) <{static_position = array<i64: 0, 0>}> : (vector<2x1x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %734 = "vector.extract"(%722) <{static_position = array<i64: 0, 1>}> : (vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x1x8x1xf8E4M3FNUZ>
          %735 = "vector.shape_cast"(%733) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %736 = "vector.shape_cast"(%734) : (vector<1x1x8x1xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %737 = "vector.shape_cast"(%732) : (vector<1x1x4x1xf32>) -> vector<4xf32>
          %738 = "amdgpu.mfma"(%735, %736, %737) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
          %739 = "vector.shape_cast"(%738) : (vector<4xf32>) -> vector<1x1x4x1xf32>
          %740 = "vector.insert"(%739, %731) <{static_position = array<i64: 0, 1>}> : (vector<1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
          %741 = "vector.extract"(%664) <{static_position = array<i64: 0, 2>}> : (vector<2x8x1x1x4x1xf32>) -> vector<1x1x4x1xf32>
          %742 = "vector.extract"(%688) <{static_position = array<i64: 0, 0>}> : (vector<2x1x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %743 = "vector.extract"(%722) <{static_position = array<i64: 0, 2>}> : (vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x1x8x1xf8E4M3FNUZ>
          %744 = "vector.shape_cast"(%742) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %745 = "vector.shape_cast"(%743) : (vector<1x1x8x1xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %746 = "vector.shape_cast"(%741) : (vector<1x1x4x1xf32>) -> vector<4xf32>
          %747 = "amdgpu.mfma"(%744, %745, %746) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
          %748 = "vector.shape_cast"(%747) : (vector<4xf32>) -> vector<1x1x4x1xf32>
          %749 = "vector.insert"(%748, %740) <{static_position = array<i64: 0, 2>}> : (vector<1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
          %750 = "vector.extract"(%664) <{static_position = array<i64: 0, 3>}> : (vector<2x8x1x1x4x1xf32>) -> vector<1x1x4x1xf32>
          %751 = "vector.extract"(%688) <{static_position = array<i64: 0, 0>}> : (vector<2x1x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %752 = "vector.extract"(%722) <{static_position = array<i64: 0, 3>}> : (vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x1x8x1xf8E4M3FNUZ>
          %753 = "vector.shape_cast"(%751) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %754 = "vector.shape_cast"(%752) : (vector<1x1x8x1xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %755 = "vector.shape_cast"(%750) : (vector<1x1x4x1xf32>) -> vector<4xf32>
          %756 = "amdgpu.mfma"(%753, %754, %755) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
          %757 = "vector.shape_cast"(%756) : (vector<4xf32>) -> vector<1x1x4x1xf32>
          %758 = "vector.insert"(%757, %749) <{static_position = array<i64: 0, 3>}> : (vector<1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
          %759 = "vector.extract"(%664) <{static_position = array<i64: 0, 4>}> : (vector<2x8x1x1x4x1xf32>) -> vector<1x1x4x1xf32>
          %760 = "vector.extract"(%688) <{static_position = array<i64: 0, 0>}> : (vector<2x1x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %761 = "vector.extract"(%722) <{static_position = array<i64: 0, 4>}> : (vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x1x8x1xf8E4M3FNUZ>
          %762 = "vector.shape_cast"(%760) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %763 = "vector.shape_cast"(%761) : (vector<1x1x8x1xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %764 = "vector.shape_cast"(%759) : (vector<1x1x4x1xf32>) -> vector<4xf32>
          %765 = "amdgpu.mfma"(%762, %763, %764) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
          %766 = "vector.shape_cast"(%765) : (vector<4xf32>) -> vector<1x1x4x1xf32>
          %767 = "vector.insert"(%766, %758) <{static_position = array<i64: 0, 4>}> : (vector<1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
          %768 = "vector.extract"(%664) <{static_position = array<i64: 0, 5>}> : (vector<2x8x1x1x4x1xf32>) -> vector<1x1x4x1xf32>
          %769 = "vector.extract"(%688) <{static_position = array<i64: 0, 0>}> : (vector<2x1x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %770 = "vector.extract"(%722) <{static_position = array<i64: 0, 5>}> : (vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x1x8x1xf8E4M3FNUZ>
          %771 = "vector.shape_cast"(%769) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %772 = "vector.shape_cast"(%770) : (vector<1x1x8x1xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %773 = "vector.shape_cast"(%768) : (vector<1x1x4x1xf32>) -> vector<4xf32>
          %774 = "amdgpu.mfma"(%771, %772, %773) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
          %775 = "vector.shape_cast"(%774) : (vector<4xf32>) -> vector<1x1x4x1xf32>
          %776 = "vector.insert"(%775, %767) <{static_position = array<i64: 0, 5>}> : (vector<1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
          %777 = "vector.extract"(%664) <{static_position = array<i64: 0, 6>}> : (vector<2x8x1x1x4x1xf32>) -> vector<1x1x4x1xf32>
          %778 = "vector.extract"(%688) <{static_position = array<i64: 0, 0>}> : (vector<2x1x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %779 = "vector.extract"(%722) <{static_position = array<i64: 0, 6>}> : (vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x1x8x1xf8E4M3FNUZ>
          %780 = "vector.shape_cast"(%778) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %781 = "vector.shape_cast"(%779) : (vector<1x1x8x1xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %782 = "vector.shape_cast"(%777) : (vector<1x1x4x1xf32>) -> vector<4xf32>
          %783 = "amdgpu.mfma"(%780, %781, %782) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
          %784 = "vector.shape_cast"(%783) : (vector<4xf32>) -> vector<1x1x4x1xf32>
          %785 = "vector.insert"(%784, %776) <{static_position = array<i64: 0, 6>}> : (vector<1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
          %786 = "vector.extract"(%664) <{static_position = array<i64: 0, 7>}> : (vector<2x8x1x1x4x1xf32>) -> vector<1x1x4x1xf32>
          %787 = "vector.extract"(%688) <{static_position = array<i64: 0, 0>}> : (vector<2x1x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %788 = "vector.extract"(%722) <{static_position = array<i64: 0, 7>}> : (vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x1x8x1xf8E4M3FNUZ>
          %789 = "vector.shape_cast"(%787) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %790 = "vector.shape_cast"(%788) : (vector<1x1x8x1xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %791 = "vector.shape_cast"(%786) : (vector<1x1x4x1xf32>) -> vector<4xf32>
          %792 = "amdgpu.mfma"(%789, %790, %791) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
          %793 = "vector.shape_cast"(%792) : (vector<4xf32>) -> vector<1x1x4x1xf32>
          %794 = "vector.insert"(%793, %785) <{static_position = array<i64: 0, 7>}> : (vector<1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
          %795 = "vector.extract"(%664) <{static_position = array<i64: 1, 0>}> : (vector<2x8x1x1x4x1xf32>) -> vector<1x1x4x1xf32>
          %796 = "vector.extract"(%688) <{static_position = array<i64: 1, 0>}> : (vector<2x1x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %797 = "vector.extract"(%722) <{static_position = array<i64: 0, 0>}> : (vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x1x8x1xf8E4M3FNUZ>
          %798 = "vector.shape_cast"(%796) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %799 = "vector.shape_cast"(%797) : (vector<1x1x8x1xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %800 = "vector.shape_cast"(%795) : (vector<1x1x4x1xf32>) -> vector<4xf32>
          %801 = "amdgpu.mfma"(%798, %799, %800) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
          %802 = "vector.shape_cast"(%801) : (vector<4xf32>) -> vector<1x1x4x1xf32>
          %803 = "vector.insert"(%802, %794) <{static_position = array<i64: 1, 0>}> : (vector<1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
          %804 = "vector.extract"(%664) <{static_position = array<i64: 1, 1>}> : (vector<2x8x1x1x4x1xf32>) -> vector<1x1x4x1xf32>
          %805 = "vector.extract"(%688) <{static_position = array<i64: 1, 0>}> : (vector<2x1x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %806 = "vector.extract"(%722) <{static_position = array<i64: 0, 1>}> : (vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x1x8x1xf8E4M3FNUZ>
          %807 = "vector.shape_cast"(%805) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %808 = "vector.shape_cast"(%806) : (vector<1x1x8x1xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %809 = "vector.shape_cast"(%804) : (vector<1x1x4x1xf32>) -> vector<4xf32>
          %810 = "amdgpu.mfma"(%807, %808, %809) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
          %811 = "vector.shape_cast"(%810) : (vector<4xf32>) -> vector<1x1x4x1xf32>
          %812 = "vector.insert"(%811, %803) <{static_position = array<i64: 1, 1>}> : (vector<1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
          %813 = "vector.extract"(%664) <{static_position = array<i64: 1, 2>}> : (vector<2x8x1x1x4x1xf32>) -> vector<1x1x4x1xf32>
          %814 = "vector.extract"(%688) <{static_position = array<i64: 1, 0>}> : (vector<2x1x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %815 = "vector.extract"(%722) <{static_position = array<i64: 0, 2>}> : (vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x1x8x1xf8E4M3FNUZ>
          %816 = "vector.shape_cast"(%814) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %817 = "vector.shape_cast"(%815) : (vector<1x1x8x1xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %818 = "vector.shape_cast"(%813) : (vector<1x1x4x1xf32>) -> vector<4xf32>
          %819 = "amdgpu.mfma"(%816, %817, %818) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
          %820 = "vector.shape_cast"(%819) : (vector<4xf32>) -> vector<1x1x4x1xf32>
          %821 = "vector.insert"(%820, %812) <{static_position = array<i64: 1, 2>}> : (vector<1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
          %822 = "vector.extract"(%664) <{static_position = array<i64: 1, 3>}> : (vector<2x8x1x1x4x1xf32>) -> vector<1x1x4x1xf32>
          %823 = "vector.extract"(%688) <{static_position = array<i64: 1, 0>}> : (vector<2x1x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %824 = "vector.extract"(%722) <{static_position = array<i64: 0, 3>}> : (vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x1x8x1xf8E4M3FNUZ>
          %825 = "vector.shape_cast"(%823) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %826 = "vector.shape_cast"(%824) : (vector<1x1x8x1xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %827 = "vector.shape_cast"(%822) : (vector<1x1x4x1xf32>) -> vector<4xf32>
          %828 = "amdgpu.mfma"(%825, %826, %827) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
          %829 = "vector.shape_cast"(%828) : (vector<4xf32>) -> vector<1x1x4x1xf32>
          %830 = "vector.insert"(%829, %821) <{static_position = array<i64: 1, 3>}> : (vector<1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
          %831 = "vector.extract"(%664) <{static_position = array<i64: 1, 4>}> : (vector<2x8x1x1x4x1xf32>) -> vector<1x1x4x1xf32>
          %832 = "vector.extract"(%688) <{static_position = array<i64: 1, 0>}> : (vector<2x1x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %833 = "vector.extract"(%722) <{static_position = array<i64: 0, 4>}> : (vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x1x8x1xf8E4M3FNUZ>
          %834 = "vector.shape_cast"(%832) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %835 = "vector.shape_cast"(%833) : (vector<1x1x8x1xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %836 = "vector.shape_cast"(%831) : (vector<1x1x4x1xf32>) -> vector<4xf32>
          %837 = "amdgpu.mfma"(%834, %835, %836) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
          %838 = "vector.shape_cast"(%837) : (vector<4xf32>) -> vector<1x1x4x1xf32>
          %839 = "vector.insert"(%838, %830) <{static_position = array<i64: 1, 4>}> : (vector<1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
          %840 = "vector.extract"(%664) <{static_position = array<i64: 1, 5>}> : (vector<2x8x1x1x4x1xf32>) -> vector<1x1x4x1xf32>
          %841 = "vector.extract"(%688) <{static_position = array<i64: 1, 0>}> : (vector<2x1x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %842 = "vector.extract"(%722) <{static_position = array<i64: 0, 5>}> : (vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x1x8x1xf8E4M3FNUZ>
          %843 = "vector.shape_cast"(%841) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %844 = "vector.shape_cast"(%842) : (vector<1x1x8x1xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %845 = "vector.shape_cast"(%840) : (vector<1x1x4x1xf32>) -> vector<4xf32>
          %846 = "amdgpu.mfma"(%843, %844, %845) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
          %847 = "vector.shape_cast"(%846) : (vector<4xf32>) -> vector<1x1x4x1xf32>
          %848 = "vector.insert"(%847, %839) <{static_position = array<i64: 1, 5>}> : (vector<1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
          %849 = "vector.extract"(%664) <{static_position = array<i64: 1, 6>}> : (vector<2x8x1x1x4x1xf32>) -> vector<1x1x4x1xf32>
          %850 = "vector.extract"(%688) <{static_position = array<i64: 1, 0>}> : (vector<2x1x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %851 = "vector.extract"(%722) <{static_position = array<i64: 0, 6>}> : (vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x1x8x1xf8E4M3FNUZ>
          %852 = "vector.shape_cast"(%850) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %853 = "vector.shape_cast"(%851) : (vector<1x1x8x1xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %854 = "vector.shape_cast"(%849) : (vector<1x1x4x1xf32>) -> vector<4xf32>
          %855 = "amdgpu.mfma"(%852, %853, %854) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
          %856 = "vector.shape_cast"(%855) : (vector<4xf32>) -> vector<1x1x4x1xf32>
          %857 = "vector.insert"(%856, %848) <{static_position = array<i64: 1, 6>}> : (vector<1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
          %858 = "vector.extract"(%664) <{static_position = array<i64: 1, 7>}> : (vector<2x8x1x1x4x1xf32>) -> vector<1x1x4x1xf32>
          %859 = "vector.extract"(%688) <{static_position = array<i64: 1, 0>}> : (vector<2x1x1x1x1x8xf8E4M3FNUZ>) -> vector<1x1x1x8xf8E4M3FNUZ>
          %860 = "vector.extract"(%722) <{static_position = array<i64: 0, 7>}> : (vector<1x8x1x1x8x1xf8E4M3FNUZ>) -> vector<1x1x8x1xf8E4M3FNUZ>
          %861 = "vector.shape_cast"(%859) : (vector<1x1x1x8xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %862 = "vector.shape_cast"(%860) : (vector<1x1x8x1xf8E4M3FNUZ>) -> vector<8xf8E4M3FNUZ>
          %863 = "vector.shape_cast"(%858) : (vector<1x1x4x1xf32>) -> vector<4xf32>
          %864 = "amdgpu.mfma"(%861, %862, %863) <{abid = 0 : i32, blgp = #amdgpu<mfma_perm_b none>, blocks = 1 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32}> : (vector<8xf8E4M3FNUZ>, vector<8xf8E4M3FNUZ>, vector<4xf32>) -> vector<4xf32>
          %865 = "vector.shape_cast"(%864) : (vector<4xf32>) -> vector<1x1x4x1xf32>
          %866 = "vector.insert"(%865, %857) <{static_position = array<i64: 1, 7>}> : (vector<1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
          "scf.yield"(%567, %612, %866) : (vector<2x1x4xf32>, vector<2x1x4xf32>, vector<2x8x1x1x4x1xf32>) -> ()
        }) : (index, index, index, vector<2x1x4xf32>, vector<2x1x4xf32>, vector<2x8x1x1x4x1xf32>) -> (vector<2x1x4xf32>, vector<2x1x4xf32>, vector<2x8x1x1x4x1xf32>)
        %169 = "vector.extract"(%168#1) <{static_position = array<i64: 0, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
        %170 = "vector.broadcast"(%169) : (vector<4xf32>) -> vector<1x4xf32>
        %171 = "vector.insert"(%170, %6) <{static_position = array<i64: 0, 0, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
        %172 = "vector.extract"(%168#1) <{static_position = array<i64: 1, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
        %173 = "vector.broadcast"(%172) : (vector<4xf32>) -> vector<1x4xf32>
        %174 = "vector.insert"(%173, %171) <{static_position = array<i64: 0, 1, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
        %175 = "vector.extract"(%168#1) <{static_position = array<i64: 0, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
        %176 = "vector.broadcast"(%175) : (vector<4xf32>) -> vector<1x4xf32>
        %177 = "vector.insert"(%176, %174) <{static_position = array<i64: 1, 0, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
        %178 = "vector.extract"(%168#1) <{static_position = array<i64: 1, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
        %179 = "vector.broadcast"(%178) : (vector<4xf32>) -> vector<1x4xf32>
        %180 = "vector.insert"(%179, %177) <{static_position = array<i64: 1, 1, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
        %181 = "vector.extract"(%168#1) <{static_position = array<i64: 0, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
        %182 = "vector.broadcast"(%181) : (vector<4xf32>) -> vector<1x4xf32>
        %183 = "vector.insert"(%182, %180) <{static_position = array<i64: 2, 0, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
        %184 = "vector.extract"(%168#1) <{static_position = array<i64: 1, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
        %185 = "vector.broadcast"(%184) : (vector<4xf32>) -> vector<1x4xf32>
        %186 = "vector.insert"(%185, %183) <{static_position = array<i64: 2, 1, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
        %187 = "vector.extract"(%168#1) <{static_position = array<i64: 0, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
        %188 = "vector.broadcast"(%187) : (vector<4xf32>) -> vector<1x4xf32>
        %189 = "vector.insert"(%188, %186) <{static_position = array<i64: 3, 0, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
        %190 = "vector.extract"(%168#1) <{static_position = array<i64: 1, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
        %191 = "vector.broadcast"(%190) : (vector<4xf32>) -> vector<1x4xf32>
        %192 = "vector.insert"(%191, %189) <{static_position = array<i64: 3, 1, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
        %193 = "vector.extract"(%168#1) <{static_position = array<i64: 0, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
        %194 = "vector.broadcast"(%193) : (vector<4xf32>) -> vector<1x4xf32>
        %195 = "vector.insert"(%194, %192) <{static_position = array<i64: 4, 0, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
        %196 = "vector.extract"(%168#1) <{static_position = array<i64: 1, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
        %197 = "vector.broadcast"(%196) : (vector<4xf32>) -> vector<1x4xf32>
        %198 = "vector.insert"(%197, %195) <{static_position = array<i64: 4, 1, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
        %199 = "vector.extract"(%168#1) <{static_position = array<i64: 0, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
        %200 = "vector.broadcast"(%199) : (vector<4xf32>) -> vector<1x4xf32>
        %201 = "vector.insert"(%200, %198) <{static_position = array<i64: 5, 0, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
        %202 = "vector.extract"(%168#1) <{static_position = array<i64: 1, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
        %203 = "vector.broadcast"(%202) : (vector<4xf32>) -> vector<1x4xf32>
        %204 = "vector.insert"(%203, %201) <{static_position = array<i64: 5, 1, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
        %205 = "vector.extract"(%168#1) <{static_position = array<i64: 0, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
        %206 = "vector.broadcast"(%205) : (vector<4xf32>) -> vector<1x4xf32>
        %207 = "vector.insert"(%206, %204) <{static_position = array<i64: 6, 0, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
        %208 = "vector.extract"(%168#1) <{static_position = array<i64: 1, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
        %209 = "vector.broadcast"(%208) : (vector<4xf32>) -> vector<1x4xf32>
        %210 = "vector.insert"(%209, %207) <{static_position = array<i64: 6, 1, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
        %211 = "vector.extract"(%168#1) <{static_position = array<i64: 0, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
        %212 = "vector.broadcast"(%211) : (vector<4xf32>) -> vector<1x4xf32>
        %213 = "vector.insert"(%212, %210) <{static_position = array<i64: 7, 0, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
        %214 = "vector.extract"(%168#1) <{static_position = array<i64: 1, 0>}> : (vector<2x1x4xf32>) -> vector<4xf32>
        %215 = "vector.broadcast"(%214) : (vector<4xf32>) -> vector<1x4xf32>
        %216 = "vector.insert"(%215, %213) <{static_position = array<i64: 7, 1, 0, 0>}> : (vector<1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
        %217 = "arith.divf"(%14, %216) <{fastmath = #arith.fastmath<none>}> : (vector<8x2x1x1x1x4xf32>, vector<8x2x1x1x1x4xf32>) -> vector<8x2x1x1x1x4xf32>
        %218 = "vector.transpose"(%217) <{permutation = array<i64: 1, 0, 3, 2, 5, 4>}> : (vector<8x2x1x1x1x4xf32>) -> vector<2x8x1x1x4x1xf32>
        %219 = "arith.mulf"(%218, %168#2) <{fastmath = #arith.fastmath<none>}> : (vector<2x8x1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
        %220 = "vector.transfer_read"(%90, %34) <{in_bounds = [], operandSegmentSizes = array<i32: 1, 0, 1, 0>, permutation_map = affine_map<() -> ()>}> : (memref<f32, strided<[], offset: ?>, #hal.descriptor_type<storage_buffer>>, f32) -> vector<f32>
        %221 = "vector.extract"(%220) <{static_position = array<i64>}> : (vector<f32>) -> f32
        %222 = "vector.broadcast"(%221) : (f32) -> vector<4x1xf32>
        %223 = "vector.insert"(%222, %27) <{static_position = array<i64: 0, 0, 0, 0>}> : (vector<4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
        %224 = "vector.extract"(%220) <{static_position = array<i64>}> : (vector<f32>) -> f32
        %225 = "vector.broadcast"(%224) : (f32) -> vector<4x1xf32>
        %226 = "vector.insert"(%225, %223) <{static_position = array<i64: 0, 1, 0, 0>}> : (vector<4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
        %227 = "vector.extract"(%220) <{static_position = array<i64>}> : (vector<f32>) -> f32
        %228 = "vector.broadcast"(%227) : (f32) -> vector<4x1xf32>
        %229 = "vector.insert"(%228, %226) <{static_position = array<i64: 0, 2, 0, 0>}> : (vector<4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
        %230 = "vector.extract"(%220) <{static_position = array<i64>}> : (vector<f32>) -> f32
        %231 = "vector.broadcast"(%230) : (f32) -> vector<4x1xf32>
        %232 = "vector.insert"(%231, %229) <{static_position = array<i64: 0, 3, 0, 0>}> : (vector<4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
        %233 = "vector.extract"(%220) <{static_position = array<i64>}> : (vector<f32>) -> f32
        %234 = "vector.broadcast"(%233) : (f32) -> vector<4x1xf32>
        %235 = "vector.insert"(%234, %232) <{static_position = array<i64: 0, 4, 0, 0>}> : (vector<4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
        %236 = "vector.extract"(%220) <{static_position = array<i64>}> : (vector<f32>) -> f32
        %237 = "vector.broadcast"(%236) : (f32) -> vector<4x1xf32>
        %238 = "vector.insert"(%237, %235) <{static_position = array<i64: 0, 5, 0, 0>}> : (vector<4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
        %239 = "vector.extract"(%220) <{static_position = array<i64>}> : (vector<f32>) -> f32
        %240 = "vector.broadcast"(%239) : (f32) -> vector<4x1xf32>
        %241 = "vector.insert"(%240, %238) <{static_position = array<i64: 0, 6, 0, 0>}> : (vector<4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
        %242 = "vector.extract"(%220) <{static_position = array<i64>}> : (vector<f32>) -> f32
        %243 = "vector.broadcast"(%242) : (f32) -> vector<4x1xf32>
        %244 = "vector.insert"(%243, %241) <{static_position = array<i64: 0, 7, 0, 0>}> : (vector<4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
        %245 = "vector.extract"(%220) <{static_position = array<i64>}> : (vector<f32>) -> f32
        %246 = "vector.broadcast"(%245) : (f32) -> vector<4x1xf32>
        %247 = "vector.insert"(%246, %244) <{static_position = array<i64: 1, 0, 0, 0>}> : (vector<4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
        %248 = "vector.extract"(%220) <{static_position = array<i64>}> : (vector<f32>) -> f32
        %249 = "vector.broadcast"(%248) : (f32) -> vector<4x1xf32>
        %250 = "vector.insert"(%249, %247) <{static_position = array<i64: 1, 1, 0, 0>}> : (vector<4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
        %251 = "vector.extract"(%220) <{static_position = array<i64>}> : (vector<f32>) -> f32
        %252 = "vector.broadcast"(%251) : (f32) -> vector<4x1xf32>
        %253 = "vector.insert"(%252, %250) <{static_position = array<i64: 1, 2, 0, 0>}> : (vector<4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
        %254 = "vector.extract"(%220) <{static_position = array<i64>}> : (vector<f32>) -> f32
        %255 = "vector.broadcast"(%254) : (f32) -> vector<4x1xf32>
        %256 = "vector.insert"(%255, %253) <{static_position = array<i64: 1, 3, 0, 0>}> : (vector<4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
        %257 = "vector.extract"(%220) <{static_position = array<i64>}> : (vector<f32>) -> f32
        %258 = "vector.broadcast"(%257) : (f32) -> vector<4x1xf32>
        %259 = "vector.insert"(%258, %256) <{static_position = array<i64: 1, 4, 0, 0>}> : (vector<4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
        %260 = "vector.extract"(%220) <{static_position = array<i64>}> : (vector<f32>) -> f32
        %261 = "vector.broadcast"(%260) : (f32) -> vector<4x1xf32>
        %262 = "vector.insert"(%261, %259) <{static_position = array<i64: 1, 5, 0, 0>}> : (vector<4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
        %263 = "vector.extract"(%220) <{static_position = array<i64>}> : (vector<f32>) -> f32
        %264 = "vector.broadcast"(%263) : (f32) -> vector<4x1xf32>
        %265 = "vector.insert"(%264, %262) <{static_position = array<i64: 1, 6, 0, 0>}> : (vector<4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
        %266 = "vector.extract"(%220) <{static_position = array<i64>}> : (vector<f32>) -> f32
        %267 = "vector.broadcast"(%266) : (f32) -> vector<4x1xf32>
        %268 = "vector.insert"(%267, %265) <{static_position = array<i64: 1, 7, 0, 0>}> : (vector<4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
        %269 = "arith.divf"(%219, %268) <{fastmath = #arith.fastmath<none>}> : (vector<2x8x1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
        %270 = "arith.cmpf"(%269, %16) <{fastmath = #arith.fastmath<none>, predicate = 11 : i64}> : (vector<2x8x1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xi1>
        %271 = "arith.select"(%270, %16, %269) : (vector<2x8x1x1x4x1xi1>, vector<2x8x1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
        %272 = "arith.cmpf"(%271, %15) <{fastmath = #arith.fastmath<none>, predicate = 9 : i64}> : (vector<2x8x1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xi1>
        %273 = "arith.select"(%272, %15, %271) : (vector<2x8x1x1x4x1xi1>, vector<2x8x1x1x4x1xf32>, vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf32>
        %274 = "arith.truncf"(%273) : (vector<2x8x1x1x4x1xf32>) -> vector<2x8x1x1x4x1xf8E4M3FNUZ>
        %275:4 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 1, 1, 64>}> : (index) -> (index, index, index, index)
        %276:3 = "affine.delinearize_index"(%39) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index, index)
        %277 = "affine.linearize_index"(%275#2, %35, %35, %276#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
        %278 = "affine.linearize_index"(%275#1, %35, %35, %276#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
        %279 = "vector.extract"(%274) <{static_position = array<i64: 0, 0, 0, 0>}> : (vector<2x8x1x1x4x1xf8E4M3FNUZ>) -> vector<4x1xf8E4M3FNUZ>
        "vector.transfer_write"(%279, %103, %277, %278) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<4x1xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, strided<[4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
        %280 = "affine.linearize_index"(%275#2, %35, %35, %276#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
        %281 = "affine.linearize_index"(%275#1, %30, %35, %276#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
        %282 = "vector.extract"(%274) <{static_position = array<i64: 0, 1, 0, 0>}> : (vector<2x8x1x1x4x1xf8E4M3FNUZ>) -> vector<4x1xf8E4M3FNUZ>
        "vector.transfer_write"(%282, %103, %280, %281) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<4x1xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, strided<[4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
        %283 = "affine.linearize_index"(%275#2, %35, %35, %276#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
        %284 = "affine.linearize_index"(%275#1, %13, %35, %276#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
        %285 = "vector.extract"(%274) <{static_position = array<i64: 0, 2, 0, 0>}> : (vector<2x8x1x1x4x1xf8E4M3FNUZ>) -> vector<4x1xf8E4M3FNUZ>
        "vector.transfer_write"(%285, %103, %283, %284) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<4x1xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, strided<[4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
        %286 = "affine.linearize_index"(%275#2, %35, %35, %276#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
        %287 = "affine.linearize_index"(%275#1, %12, %35, %276#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
        %288 = "vector.extract"(%274) <{static_position = array<i64: 0, 3, 0, 0>}> : (vector<2x8x1x1x4x1xf8E4M3FNUZ>) -> vector<4x1xf8E4M3FNUZ>
        "vector.transfer_write"(%288, %103, %286, %287) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<4x1xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, strided<[4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
        %289 = "affine.linearize_index"(%275#2, %35, %35, %276#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
        %290 = "affine.linearize_index"(%275#1, %3, %35, %276#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
        %291 = "vector.extract"(%274) <{static_position = array<i64: 0, 4, 0, 0>}> : (vector<2x8x1x1x4x1xf8E4M3FNUZ>) -> vector<4x1xf8E4M3FNUZ>
        "vector.transfer_write"(%291, %103, %289, %290) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<4x1xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, strided<[4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
        %292 = "affine.linearize_index"(%275#2, %35, %35, %276#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
        %293 = "affine.linearize_index"(%275#1, %2, %35, %276#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
        %294 = "vector.extract"(%274) <{static_position = array<i64: 0, 5, 0, 0>}> : (vector<2x8x1x1x4x1xf8E4M3FNUZ>) -> vector<4x1xf8E4M3FNUZ>
        "vector.transfer_write"(%294, %103, %292, %293) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<4x1xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, strided<[4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
        %295 = "affine.linearize_index"(%275#2, %35, %35, %276#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
        %296 = "affine.linearize_index"(%275#1, %1, %35, %276#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
        %297 = "vector.extract"(%274) <{static_position = array<i64: 0, 6, 0, 0>}> : (vector<2x8x1x1x4x1xf8E4M3FNUZ>) -> vector<4x1xf8E4M3FNUZ>
        "vector.transfer_write"(%297, %103, %295, %296) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<4x1xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, strided<[4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
        %298 = "affine.linearize_index"(%275#2, %35, %35, %276#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
        %299 = "affine.linearize_index"(%275#1, %0, %35, %276#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
        %300 = "vector.extract"(%274) <{static_position = array<i64: 0, 7, 0, 0>}> : (vector<2x8x1x1x4x1xf8E4M3FNUZ>) -> vector<4x1xf8E4M3FNUZ>
        "vector.transfer_write"(%300, %103, %298, %299) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<4x1xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, strided<[4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
        %301 = "affine.linearize_index"(%275#2, %30, %35, %276#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
        %302 = "affine.linearize_index"(%275#1, %35, %35, %276#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
        %303 = "vector.extract"(%274) <{static_position = array<i64: 1, 0, 0, 0>}> : (vector<2x8x1x1x4x1xf8E4M3FNUZ>) -> vector<4x1xf8E4M3FNUZ>
        "vector.transfer_write"(%303, %103, %301, %302) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<4x1xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, strided<[4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
        %304 = "affine.linearize_index"(%275#2, %30, %35, %276#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
        %305 = "affine.linearize_index"(%275#1, %30, %35, %276#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
        %306 = "vector.extract"(%274) <{static_position = array<i64: 1, 1, 0, 0>}> : (vector<2x8x1x1x4x1xf8E4M3FNUZ>) -> vector<4x1xf8E4M3FNUZ>
        "vector.transfer_write"(%306, %103, %304, %305) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<4x1xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, strided<[4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
        %307 = "affine.linearize_index"(%275#2, %30, %35, %276#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
        %308 = "affine.linearize_index"(%275#1, %13, %35, %276#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
        %309 = "vector.extract"(%274) <{static_position = array<i64: 1, 2, 0, 0>}> : (vector<2x8x1x1x4x1xf8E4M3FNUZ>) -> vector<4x1xf8E4M3FNUZ>
        "vector.transfer_write"(%309, %103, %307, %308) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<4x1xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, strided<[4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
        %310 = "affine.linearize_index"(%275#2, %30, %35, %276#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
        %311 = "affine.linearize_index"(%275#1, %12, %35, %276#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
        %312 = "vector.extract"(%274) <{static_position = array<i64: 1, 3, 0, 0>}> : (vector<2x8x1x1x4x1xf8E4M3FNUZ>) -> vector<4x1xf8E4M3FNUZ>
        "vector.transfer_write"(%312, %103, %310, %311) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<4x1xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, strided<[4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
        %313 = "affine.linearize_index"(%275#2, %30, %35, %276#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
        %314 = "affine.linearize_index"(%275#1, %3, %35, %276#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
        %315 = "vector.extract"(%274) <{static_position = array<i64: 1, 4, 0, 0>}> : (vector<2x8x1x1x4x1xf8E4M3FNUZ>) -> vector<4x1xf8E4M3FNUZ>
        "vector.transfer_write"(%315, %103, %313, %314) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<4x1xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, strided<[4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
        %316 = "affine.linearize_index"(%275#2, %30, %35, %276#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
        %317 = "affine.linearize_index"(%275#1, %2, %35, %276#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
        %318 = "vector.extract"(%274) <{static_position = array<i64: 1, 5, 0, 0>}> : (vector<2x8x1x1x4x1xf8E4M3FNUZ>) -> vector<4x1xf8E4M3FNUZ>
        "vector.transfer_write"(%318, %103, %316, %317) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<4x1xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, strided<[4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
        %319 = "affine.linearize_index"(%275#2, %30, %35, %276#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
        %320 = "affine.linearize_index"(%275#1, %1, %35, %276#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
        %321 = "vector.extract"(%274) <{static_position = array<i64: 1, 6, 0, 0>}> : (vector<2x8x1x1x4x1xf8E4M3FNUZ>) -> vector<4x1xf8E4M3FNUZ>
        "vector.transfer_write"(%321, %103, %319, %320) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<4x1xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, strided<[4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
        %322 = "affine.linearize_index"(%275#2, %30, %35, %276#1, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 2, 1, 4, 4>}> : (index, index, index, index, index) -> index
        %323 = "affine.linearize_index"(%275#1, %0, %35, %276#2, %35) <{disjoint, operandSegmentSizes = array<i32: 5, 0>, static_basis = array<i64: 1, 8, 1, 16, 1>}> : (index, index, index, index, index) -> index
        %324 = "vector.extract"(%274) <{static_position = array<i64: 1, 7, 0, 0>}> : (vector<2x8x1x1x4x1xf8E4M3FNUZ>) -> vector<4x1xf8E4M3FNUZ>
        "vector.transfer_write"(%324, %103, %322, %323) <{in_bounds = [true, true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>}> : (vector<4x1xf8E4M3FNUZ>, memref<32x128xf8E4M3FNUZ, strided<[4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
        "scf.forall.in_parallel"() ({
        ^bb0:
        }) : () -> ()
      }) : (index) -> ()
      "memref.dealloc"(%44) : (memref<1x32x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>) -> ()
      "memref.dealloc"(%43) : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>) -> ()
      "memref.dealloc"(%42) : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>) -> ()
      "memref.dealloc"(%41) : (memref<32x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>) -> ()
      "memref.dealloc"(%40) : (memref<32x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>) -> ()
      "func.return"() : () -> ()
    }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} : () -> ()
  }) : () -> ()
  "hal.executable.variant_end"() : () -> ()
 }) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
    %1 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4, #map5]} ins(%collapsed, %collapsed_1, %collapsed_2, %extracted, %arg4 : tensor<32x?x128xf8E4M3FNUZ>, tensor<32x?x128xf8E4M3FNUZ>, tensor<32x?x128xf8E4M3FNUZ>, f32, tensor<?x?xf8E4M3FNUZ>) outs(%cast : tensor<32x?x128xf32>) {