@bjacob
Created April 24, 2025 19:25
// -----// IR Dump After LoweringConfigInterpreterPass (iree-codegen-lowering-config-interpreter) //----- //
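// Overview: this dispatch computes C[b, m, n] += A[b, m, k] * B[n, k], with A: ?x256x4096 and
// B: 4096x4096 in f8E4M3FNUZ accumulating in f32, using the LLVMGPUTileAndFuse pipeline
// (512 threads per workgroup, subgroup size 64) and the "cast_and_call_expanded_f8_pingpong_matmul"
// lowering strategy applied by the tuning spec.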
func.func @matmul_dispatch_0_matmul_like_Dx256x4096x4096_f8E4M3FNUZxf8E4M3FNUZxf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [512, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>, llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}}>} {
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
%cst = arith.constant 0.000000e+00 : f32
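// The dynamic batch extent arrives as two 32-bit push constants and is reassembled into a
// 64-bit index below; util.assume.int gives later passes a known range for it.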
%0 = hal.interface.constant.load layout(<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = arith.extui %0 : i32 to i64
%3 = arith.extui %1 : i32 to i64
%4 = arith.shli %3, %c32_i64 : i64
%5 = arith.ori %2, %4 : i64
%6 = arith.index_castui %5 : i64 to index
%7 = util.assume.int %6<umin = 0, umax = 35184372088831> : index
%8 = hal.interface.binding.subspan layout(<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") {iree_gpu.use_rocdl_buffer_instructions} : !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>>
%9 = flow.dispatch.workload.ordinal %7, 0 : index
%10 = hal.interface.binding.subspan layout(<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x256x4096xf8E4M3FNUZ>>{%9}
%11 = hal.interface.binding.subspan layout(<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<?x256x4096xf32>>{%9}
%12 = flow.dispatch.tensor.load %10, offsets = [0, 0, 0], sizes = [%9, 256, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x256x4096xf8E4M3FNUZ>>{%9} -> tensor<?x256x4096xf8E4M3FNUZ>
%13 = flow.dispatch.tensor.load %8, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>> -> tensor<4096x4096xf8E4M3FNUZ>
%14 = tensor.empty(%9) : tensor<?x256x4096xf32>
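// Workgroup-level tiling per the workgroup = [1, 256, 256, 0] config: each workgroup owns one
// batch element and a 256x256 output tile, reducing over the full K = 4096.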
%15 = scf.forall (%arg0, %arg1) = (0, 0) to (%9, 4096) step (1, 256) shared_outs(%arg2 = %14) -> (tensor<?x256x4096xf32>) {
%extracted_slice = tensor.extract_slice %12[%arg0, 0, 0] [1, 256, 4096] [1, 1, 1] : tensor<?x256x4096xf8E4M3FNUZ> to tensor<1x256x4096xf8E4M3FNUZ>
%extracted_slice_0 = tensor.extract_slice %13[%arg1, 0] [256, 4096] [1, 1] : tensor<4096x4096xf8E4M3FNUZ> to tensor<256x4096xf8E4M3FNUZ>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, 0, %arg1] [1, 256, 256] [1, 1, 1] : tensor<?x256x4096xf32> to tensor<1x256x256xf32>
%16 = linalg.fill ins(%cst : f32) outs(%extracted_slice_1 : tensor<1x256x256xf32>) -> tensor<1x256x256xf32>
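// The original contraction, still carrying the lowering_config; its result %17 is not used
// below, so it appears to be left behind once the strategy-expanded schedule that follows
// takes over.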
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%extracted_slice, %extracted_slice_0 : tensor<1x256x4096xf8E4M3FNUZ>, tensor<256x4096xf8E4M3FNUZ>) outs(%16 : tensor<1x256x256xf32>) attrs = {__tuning_spec_applied__, lowering_config = #iree_gpu.lowering_config<{lowering_strategy = "cast_and_call_expanded_f8_pingpong_matmul", workgroup = [1, 256, 256, 0]}>} {
^bb0(%in: f8E4M3FNUZ, %in_11: f8E4M3FNUZ, %out: f32):
%24 = arith.extf %in : f8E4M3FNUZ to f32
%25 = arith.extf %in_11 : f8E4M3FNUZ to f32
%26 = arith.mulf %24, %25 : f32
%27 = arith.addf %out, %26 : f32
linalg.yield %27 : f32
} -> tensor<1x256x256xf32>
%cast = tensor.cast %extracted_slice : tensor<1x256x4096xf8E4M3FNUZ> to tensor<1x256x?xf8E4M3FNUZ>
%cast_2 = tensor.cast %extracted_slice_0 : tensor<256x4096xf8E4M3FNUZ> to tensor<256x?xf8E4M3FNUZ>
%c0_3 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%cst_4 = arith.constant 0.000000e+00 : f8E4M3FNUZ
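// Two 32 KiB LDS buffers, one per operand, each holding a 256x128 f8 tile, with rotate_rows
// swizzle hints (presumably to avoid LDS bank conflicts); the buffer_resource_cast ops below
// request buffer loads with a cache swizzle stride equal to the K extent.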
%alloc = memref.alloc() : memref<32768xf8E4M3FNUZ, #gpu.address_space<workgroup>>
%alloc_5 = memref.alloc() : memref<32768xf8E4M3FNUZ, #gpu.address_space<workgroup>>
%dim = tensor.dim %cast_2, %c1 : tensor<256x?xf8E4M3FNUZ>
%18 = iree_gpu.buffer_resource_cast %cast cacheSwizzleStride(%dim) : tensor<1x256x?xf8E4M3FNUZ>
%19 = iree_gpu.buffer_resource_cast %cast_2 cacheSwizzleStride(%dim) : tensor<256x?xf8E4M3FNUZ>
%20 = iree_codegen.swizzle_hint %alloc[#iree_codegen.rotate_rows<128, 8>] : memref<32768xf8E4M3FNUZ, #gpu.address_space<workgroup>>
%21 = iree_codegen.swizzle_hint %alloc_5[#iree_codegen.rotate_rows<128, 8>] : memref<32768xf8E4M3FNUZ, #gpu.address_space<workgroup>>
%expand_shape = memref.expand_shape %20 [[0, 1]] output_shape [256, 128] : memref<32768xf8E4M3FNUZ, #gpu.address_space<workgroup>> into memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
%expand_shape_6 = memref.expand_shape %21 [[0, 1]] output_shape [256, 128] : memref<32768xf8E4M3FNUZ, #gpu.address_space<workgroup>> into memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
%extracted_slice_7 = tensor.extract_slice %18[0, 0, 0] [1, 256, 128] [1, 1, 1] : tensor<1x256x?xf8E4M3FNUZ> to tensor<1x256x128xf8E4M3FNUZ>
%extracted_slice_8 = tensor.extract_slice %19[0, 0] [256, 128] [1, 1] : tensor<256x?xf8E4M3FNUZ> to tensor<256x128xf8E4M3FNUZ>
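// Prologue: two scf.forall regions over 2048 linear ids stage the first 128-wide K slice of
// each operand tile from global memory into LDS.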
scf.forall (%arg3) in (2048) {
%24:2 = affine.delinearize_index %arg3 into (256, 8) : index, index
%25 = arith.muli %24#1, %c8 : index
%extracted_slice_11 = tensor.extract_slice %extracted_slice_7[0, %24#0, %25] [1, 1, 16] [1, 1, 1] : tensor<1x256x128xf8E4M3FNUZ> to tensor<1x1x16xf8E4M3FNUZ>
%26 = vector.transfer_read %extracted_slice_11[%c0_3, %c0_3, %c0_3], %cst_4 {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
vector.transfer_write %26, %expand_shape[%24#0, %25] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
} {mapping = [#gpu.thread<linear_dim_0>]}
scf.forall (%arg3) in (2048) {
%24:2 = affine.delinearize_index %arg3 into (256, 8) : index, index
%25 = arith.muli %24#1, %c8 : index
%extracted_slice_11 = tensor.extract_slice %extracted_slice_8[%24#0, %25] [1, 16] [1, 1] : tensor<256x128xf8E4M3FNUZ> to tensor<1x16xf8E4M3FNUZ>
%26 = vector.transfer_read %extracted_slice_11[%c0_3, %c0_3], %cst_4 {in_bounds = [true, true]} : tensor<1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
vector.transfer_write %26, %expand_shape_6[%24#0, %25] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
} {mapping = [#gpu.thread<linear_dim_0>]}
%expand_shape_9 = memref.expand_shape %expand_shape [[0, 1], [2, 3]] output_shape [16, 16, 4, 32] : memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>> into memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>
%expand_shape_10 = memref.expand_shape %expand_shape_6 [[0, 1], [2, 3]] output_shape [16, 16, 4, 32] : memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>> into memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>
%22 = tensor.empty() : tensor<1x16x16x16x16xf32>
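// Main compute loop: an scf.forall over 512 linear thread ids (8 waves of 64). LDS has been
// reshaped to 16x16x4x32 (16x16 row tiles, 4 K sub-steps of 32 f8 each, matching the
// 16x16x32 MFMA shape); results accumulate into a 1x16x16x16x16 f32 tensor.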
%23 = scf.forall (%arg3) in (512) shared_outs(%arg4 = %22) -> (tensor<1x16x16x16x16xf32>) {
%24:4 = affine.delinearize_index %arg3 into (2, 4, 4, 16) : index, index, index, index
%25 = arith.muli %24#2, %c8 : index
%26 = arith.muli %24#0, %c8 : index
%27 = arith.muli %24#1, %c4 : index
%28:2 = affine.delinearize_index %arg3 into (64, 8) : index, index
%29:3 = affine.delinearize_index %arg3 into (8, 8, 8) : index, index, index
%30 = arith.muli %29#2, %c16 : index
%31 = arith.muli %29#0, %c32 : index
%32 = arith.addi %29#1, %31 : index
%33 = arith.addi %32, %c8 : index
%34 = arith.addi %33, %c8 : index
%35 = arith.addi %34, %c8 : index
%cst_11 = arith.constant dense<0.000000e+00> : vector<8x4x1x4xf32>
%36 = arith.cmpi slt, %arg3, %c256 : index
%37 = arith.cmpi sge, %arg3, %c256 : index
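// Ping-pong stagger: the first half of the ids (< 256, i.e. the first four waves) takes an
// extra s_barrier before the K loop and the second half takes it after, which appears to
// offset the two wave groups so one computes while the other fetches.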
scf.if %36 {
rocdl.s.barrier
}
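// Software-pipelined K loop over 128-wide tiles, starting at K = 128 because the first tile
// was staged by the prologue. Each iteration prefetches the next global tile into registers,
// runs four MFMA sub-steps on the current LDS contents (bracketed by s_setprio and
// sched.barrier), and writes the prefetched data into LDS before the final sub-step. The
// per-id accumulator is a vector<8x4x1x4xf32>.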
%38 = scf.for %arg5 = %c128 to %dim step %c128 iter_args(%arg6 = %cst_11) -> (vector<8x4x1x4xf32>) {
%extracted_slice_12 = tensor.extract_slice %18[0, 0, %arg5] [1, 256, 128] [1, 1, 1] : tensor<1x256x?xf8E4M3FNUZ> to tensor<1x256x128xf8E4M3FNUZ>
%extracted_slice_13 = tensor.extract_slice %extracted_slice_12[0, %32, %30] [1, 1, 16] [1, 1, 1] : tensor<1x256x128xf8E4M3FNUZ> to tensor<1x1x16xf8E4M3FNUZ>
%54 = vector.transfer_read %extracted_slice_13[%c0_3, %c0_3, %c0_3], %cst_4 {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
%extracted_slice_14 = tensor.extract_slice %extracted_slice_12[0, %33, %30] [1, 1, 16] [1, 1, 1] : tensor<1x256x128xf8E4M3FNUZ> to tensor<1x1x16xf8E4M3FNUZ>
%55 = vector.transfer_read %extracted_slice_14[%c0_3, %c0_3, %c0_3], %cst_4 {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
%extracted_slice_15 = tensor.extract_slice %extracted_slice_12[0, %34, %30] [1, 1, 16] [1, 1, 1] : tensor<1x256x128xf8E4M3FNUZ> to tensor<1x1x16xf8E4M3FNUZ>
%56 = vector.transfer_read %extracted_slice_15[%c0_3, %c0_3, %c0_3], %cst_4 {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
%extracted_slice_16 = tensor.extract_slice %extracted_slice_12[0, %35, %30] [1, 1, 16] [1, 1, 1] : tensor<1x256x128xf8E4M3FNUZ> to tensor<1x1x16xf8E4M3FNUZ>
%57 = vector.transfer_read %extracted_slice_16[%c0_3, %c0_3, %c0_3], %cst_4 {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
%58 = vector.transfer_read %expand_shape_9[%26, %24#3, %c0_3, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<8x1x1x8xf8E4M3FNUZ>
%59 = vector.transfer_read %expand_shape_10[%27, %24#3, %c0_3, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<4x1x1x8xf8E4M3FNUZ>
gpu.barrier
rocdl.sched.barrier 0
rocdl.s.setprio 1 {iree_gpu.swap_mfma = 1 : i64}
%60 = iree_gpu.multi_mma %58, %59, %arg6 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
rocdl.s.setprio 0
gpu.barrier
rocdl.sched.barrier 0
%extracted_slice_17 = tensor.extract_slice %19[0, %arg5] [256, 128] [1, 1] : tensor<256x?xf8E4M3FNUZ> to tensor<256x128xf8E4M3FNUZ>
%extracted_slice_18 = tensor.extract_slice %extracted_slice_17[%32, %30] [1, 16] [1, 1] : tensor<256x128xf8E4M3FNUZ> to tensor<1x16xf8E4M3FNUZ>
%61 = vector.transfer_read %extracted_slice_18[%c0_3, %c0_3], %cst_4 {in_bounds = [true, true]} : tensor<1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
%extracted_slice_19 = tensor.extract_slice %extracted_slice_17[%33, %30] [1, 16] [1, 1] : tensor<256x128xf8E4M3FNUZ> to tensor<1x16xf8E4M3FNUZ>
%62 = vector.transfer_read %extracted_slice_19[%c0_3, %c0_3], %cst_4 {in_bounds = [true, true]} : tensor<1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
%extracted_slice_20 = tensor.extract_slice %extracted_slice_17[%34, %30] [1, 16] [1, 1] : tensor<256x128xf8E4M3FNUZ> to tensor<1x16xf8E4M3FNUZ>
%63 = vector.transfer_read %extracted_slice_20[%c0_3, %c0_3], %cst_4 {in_bounds = [true, true]} : tensor<1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
%extracted_slice_21 = tensor.extract_slice %extracted_slice_17[%35, %30] [1, 16] [1, 1] : tensor<256x128xf8E4M3FNUZ> to tensor<1x16xf8E4M3FNUZ>
%64 = vector.transfer_read %extracted_slice_21[%c0_3, %c0_3], %cst_4 {in_bounds = [true, true]} : tensor<1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
%65 = vector.transfer_read %expand_shape_9[%26, %24#3, %c1, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<8x1x1x8xf8E4M3FNUZ>
%66 = vector.transfer_read %expand_shape_10[%27, %24#3, %c1, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<4x1x1x8xf8E4M3FNUZ>
gpu.barrier
rocdl.sched.barrier 0
rocdl.s.setprio 1 {iree_gpu.swap_mfma = 1 : i64}
%67 = iree_gpu.multi_mma %65, %66, %60 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
rocdl.s.setprio 0
gpu.barrier
rocdl.sched.barrier 0
%68 = vector.transfer_read %expand_shape_9[%26, %24#3, %c2, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<8x1x1x8xf8E4M3FNUZ>
%69 = vector.transfer_read %expand_shape_10[%27, %24#3, %c2, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<4x1x1x8xf8E4M3FNUZ>
%70 = vector.transfer_read %expand_shape_9[%26, %24#3, %c3, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<8x1x1x8xf8E4M3FNUZ>
%71 = vector.transfer_read %expand_shape_10[%27, %24#3, %c3, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<4x1x1x8xf8E4M3FNUZ>
gpu.barrier
rocdl.sched.barrier 0
rocdl.s.setprio 1 {iree_gpu.swap_mfma = 1 : i64}
%72 = iree_gpu.multi_mma %68, %69, %67 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
rocdl.s.setprio 0
gpu.barrier
rocdl.sched.barrier 0
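// All LDS reads for this iteration are done; write the prefetched next K tile into LDS so it
// is ready for the next iteration, overlapping the stores with the final MFMA below.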
vector.transfer_write %54, %expand_shape[%32, %30] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
vector.transfer_write %55, %expand_shape[%33, %30] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
vector.transfer_write %56, %expand_shape[%34, %30] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
vector.transfer_write %57, %expand_shape[%35, %30] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
vector.transfer_write %61, %expand_shape_6[%32, %30] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
vector.transfer_write %62, %expand_shape_6[%33, %30] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
vector.transfer_write %63, %expand_shape_6[%34, %30] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
vector.transfer_write %64, %expand_shape_6[%35, %30] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
gpu.barrier
rocdl.sched.barrier 0
rocdl.s.setprio 1 {iree_gpu.swap_mfma = 1 : i64}
%73 = iree_gpu.multi_mma %70, %71, %72 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
rocdl.s.setprio 0
gpu.barrier
rocdl.sched.barrier 0
scf.yield %73 : vector<8x4x1x4xf32>
}
scf.if %37 {
rocdl.s.barrier
}
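// Epilogue: the last K tile is still resident in LDS, so finish with four un-pipelined MFMA
// sub-steps outside the loop.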
%39 = vector.transfer_read %expand_shape_9[%26, %24#3, %c0_3, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<8x1x1x8xf8E4M3FNUZ>
%40 = vector.transfer_read %expand_shape_10[%27, %24#3, %c0_3, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<4x1x1x8xf8E4M3FNUZ>
%41 = iree_gpu.multi_mma %39, %40, %38 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
%42 = vector.transfer_read %expand_shape_9[%26, %24#3, %c1, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<8x1x1x8xf8E4M3FNUZ>
%43 = vector.transfer_read %expand_shape_10[%27, %24#3, %c1, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<4x1x1x8xf8E4M3FNUZ>
%44 = iree_gpu.multi_mma %42, %43, %41 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
%45 = vector.transfer_read %expand_shape_9[%26, %24#3, %c2, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<8x1x1x8xf8E4M3FNUZ>
%46 = vector.transfer_read %expand_shape_10[%27, %24#3, %c2, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<4x1x1x8xf8E4M3FNUZ>
%47 = iree_gpu.multi_mma %45, %46, %44 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
%48 = vector.transfer_read %expand_shape_9[%26, %24#3, %c3, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<8x1x1x8xf8E4M3FNUZ>
%49 = vector.transfer_read %expand_shape_10[%27, %24#3, %c3, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<4x1x1x8xf8E4M3FNUZ>
%50 = iree_gpu.multi_mma %48, %49, %47 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
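// Transpose each id's accumulator fragment to 8x1x4x4 and insert it into the shared
// 1x16x16x16x16 result at this id's tile coordinates.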
%51 = vector.transpose %50, [0, 2, 1, 3] : vector<8x4x1x4xf32> to vector<8x1x4x4xf32>
%52 = tensor.empty() : tensor<1x8x1x4x4xf32>
%53 = vector.transfer_write %51, %52[%c0_3, %c0_3, %c0_3, %c0_3, %c0_3] {in_bounds = [true, true, true, true]} : vector<8x1x4x4xf32>, tensor<1x8x1x4x4xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %53 into %arg4[0, %26, %24#3, %27, %25] [1, 8, 1, 4, 4] [1, 1, 1, 1, 1] : tensor<1x8x1x4x4xf32> into tensor<1x16x16x16x16xf32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
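// Collapse the per-fragment result layout back to the 1x256x256 workgroup tile and insert it
// into the output tensor.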
%collapsed = tensor.collapse_shape %23 [[0], [1, 2], [3, 4]] : tensor<1x16x16x16x16xf32> into tensor<1x256x256xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %collapsed into %arg2[%arg0, 0, %arg1] [1, 256, 256] [1, 1, 1] : tensor<1x256x256xf32> into tensor<?x256x4096xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %15, %11, offsets = [0, 0, 0], sizes = [%9, 256, 4096], strides = [1, 1, 1] : tensor<?x256x4096xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x256x4096xf32>>{%9}
return
}