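This is the full `--mlir-print-ir-after-all` trace of IREE compiling a dynamically shaped i8 x i8 -> i32 linalg.matmul for a gfx942 (MI300-class) GPU through the HIP/ROCm target. As a rough guide to reproducing a trace like this one, here is a minimal sketch using the IREE compiler Python bindings; the target flag spellings are assumptions that have drifted across IREE releases, so verify them against `iree-compile --help` for your build.

# Reproduction sketch (assumptions: the "rocm" backend name and the
# --iree-hip-target flag spelling match your IREE release).
from iree.compiler import compile_str

MATMUL_MLIR = """
func.func @foo(%arg0: tensor<?x?xi8>, %arg1: tensor<?x?xi8>,
               %arg2: tensor<?x?xi32>) -> tensor<?x?xi32> {
  %0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xi8>, tensor<?x?xi8>)
                     outs(%arg2 : tensor<?x?xi32>) -> tensor<?x?xi32>
  return %0 : tensor<?x?xi32>
}
"""

vmfb = compile_str(
    MATMUL_MLIR,
    target_backends=["rocm"],         # HIP/ROCm HAL target, as in this trace
    extra_args=[
        "--iree-hip-target=gfx942",   # assumed spelling for the MI300 arch
        "--mlir-print-ir-after-all",  # emits dumps like the ones below (stderr)
    ],
)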
// -----// IR Dump After AutoInputConversionPipelinePass (iree-auto-input-conversion) //----- //
module {
  func.func @foo(%arg0: tensor<?x?xi8>, %arg1: tensor<?x?xi8>, %arg2: tensor<?x?xi32>) -> tensor<?x?xi32> {
    %0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%arg2 : tensor<?x?xi32>) -> tensor<?x?xi32>
    return %0 : tensor<?x?xi32>
  }
}
// -----// IR Dump After IREEImportPublicPass (iree-import-public) //----- //
module {
  util.func public @foo(%arg0: tensor<?x?xi8>, %arg1: tensor<?x?xi8>, %arg2: tensor<?x?xi32>) -> tensor<?x?xi32> {
    %0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%arg2 : tensor<?x?xi32>) -> tensor<?x?xi32>
    util.return %0 : tensor<?x?xi32>
  }
}
// -----// IR Dump After ImportMLProgramPass (iree-import-ml-program) //----- //
module {
  util.func public @foo(%arg0: tensor<?x?xi8>, %arg1: tensor<?x?xi8>, %arg2: tensor<?x?xi32>) -> tensor<?x?xi32> {
    %0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%arg2 : tensor<?x?xi32>) -> tensor<?x?xi32>
    util.return %0 : tensor<?x?xi32>
  }
}
// -----// IR Dump After SanitizeModuleNamesPass (iree-sanitize-module-names) //----- //
module {
  util.func public @foo(%arg0: tensor<?x?xi8>, %arg1: tensor<?x?xi8>, %arg2: tensor<?x?xi32>) -> tensor<?x?xi32> {
    %0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%arg2 : tensor<?x?xi32>) -> tensor<?x?xi32>
    util.return %0 : tensor<?x?xi32>
  }
}
// -----// IR Dump After ConvertMeshToFlowPass (iree-convert-mesh-to-flow) //----- //
module {
  util.func public @foo(%arg0: tensor<?x?xi8>, %arg1: tensor<?x?xi8>, %arg2: tensor<?x?xi32>) -> tensor<?x?xi32> {
    %0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%arg2 : tensor<?x?xi32>) -> tensor<?x?xi32>
    util.return %0 : tensor<?x?xi32>
  }
}
// -----// IR Dump After DemoteF64ToF32Pass (iree-input-conversion-demote-f64-to-f32) //----- //
module {
  util.func public @foo(%arg0: tensor<?x?xi8>, %arg1: tensor<?x?xi8>, %arg2: tensor<?x?xi32>) -> tensor<?x?xi32> {
    %0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%arg2 : tensor<?x?xi32>) -> tensor<?x?xi32>
    util.return %0 : tensor<?x?xi32>
  }
}
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::ConvertStreamableOpsPass (iree-abi-convert-streamable-ops) //----- //
module {
  util.func public @foo(%arg0: tensor<?x?xi8>, %arg1: tensor<?x?xi8>, %arg2: tensor<?x?xi32>) -> tensor<?x?xi32> {
    %0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%arg2 : tensor<?x?xi32>) -> tensor<?x?xi32>
    util.return %0 : tensor<?x?xi32>
  }
}
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass (iree-abi-wrap-entry-points) //----- //
module {
  util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
    %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
    %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
    %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
    %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
    %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
    %9 = util.call @_foo(%2, %5, %8) : (tensor<?x?xi8>, tensor<?x?xi8>, tensor<?x?xi32>) -> tensor<?x?xi32>
    %c0 = arith.constant 0 : index
    %dim = tensor.dim %9, %c0 : tensor<?x?xi32>
    %c1 = arith.constant 1 : index
    %dim_0 = tensor.dim %9, %c1 : tensor<?x?xi32>
    %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%dim, %dim_0} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
  util.func private @_foo(%arg0: tensor<?x?xi8>, %arg1: tensor<?x?xi8>, %arg2: tensor<?x?xi32>) -> tensor<?x?xi32> {
    %0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%arg2 : tensor<?x?xi32>) -> tensor<?x?xi32>
    util.return %0 : tensor<?x?xi32>
  }
}
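At this point iree-abi-wrap-entry-points has split the entry point in two: a private @_foo holding the original tensor math, and a public @foo that unwraps the !hal.buffer_view arguments, forwards them, and re-exports the result. That public signature is what a runtime caller sees. A hypothetical invocation through the IREE Python runtime might look like the sketch below; the driver name "hip" and the `load_vm_flatbuffer_file` helper are assumptions to check against your iree-runtime version.

# Hypothetical caller of the wrapped entry point, assuming the module was
# compiled to foo.vmfb and an AMD GPU is visible to the "hip" driver.
import numpy as np
import iree.runtime as ireert

module = ireert.load_vm_flatbuffer_file("foo.vmfb", driver="hip")
a = np.zeros((4, 8), dtype=np.int8)     # %input0: tensor<?x?xi8>
b = np.zeros((8, 5), dtype=np.int8)     # %input1: tensor<?x?xi8>
acc = np.zeros((4, 5), dtype=np.int32)  # %input2: tensor<?x?xi32>
result = module.foo(a, b, acc)          # -> %output0: tensor<?x?xi32>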
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func private @_foo(%arg0: tensor<?x?xi8>, %arg1: tensor<?x?xi8>, %arg2: tensor<?x?xi32>) -> tensor<?x?xi32> {
  %0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%arg2 : tensor<?x?xi32>) -> tensor<?x?xi32>
  util.return %0 : tensor<?x?xi32>
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %c1 = arith.constant 1 : index
  %c0 = arith.constant 0 : index
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = util.call @_foo(%2, %5, %8) : (tensor<?x?xi8>, tensor<?x?xi8>, tensor<?x?xi32>) -> tensor<?x?xi32>
  %dim = tensor.dim %9, %c0 : tensor<?x?xi32>
  %dim_0 = tensor.dim %9, %c1 : tensor<?x?xi32>
  %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%dim, %dim_0} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
}
// -----// IR Dump After Inliner (inline) //----- //
module {
  util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
    %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
    %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
    %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
    %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
    %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
    %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
    %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
}
// -----// IR Dump After SymbolDCE (symbol-dce) //----- //
module {
  util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
    %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
    %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
    %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
    %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
    %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
    %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
    %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
}
// -----// IR Dump After AssignLegacyTargetDevicesPass (iree-hal-assign-legacy-target-devices) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}>
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {hal.device.targets = [#device_target_hip]} {
  util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
    %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
    %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
    %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
    %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
    %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
    %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
    %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
}
// -----// IR Dump After MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}>
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_hip
  util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
    %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
    %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
    %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
    %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
    %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
    %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
    %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
}
// -----// IR Dump After ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}>
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_hip
  util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
    %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
    %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
    %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
    %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
    %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
    %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
    %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
}
// -----// IR Dump After ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}>
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_hip
  util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
    %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
    %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
    %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
    %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
    %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
    %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
    %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
}
// -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}>
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_hip
  util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
    %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
    %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
    %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
    %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
    %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
    %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
    %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
}
// -----// IR Dump After LinalgQuantizedConvToConvPass (iree-global-opt-quantized-conv-to-conv) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
}
// -----// IR Dump After LinalgQuantizedMatmulToMatmulPass (iree-global-opt-quantized-matmul-to-matmul) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
}
// -----// IR Dump After RemoveZeroExtentTensorsPass (iree-global-opt-remove-zero-extent-tensors) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
}
// -----// IR Dump After DetachElementwiseFromNamedOpsPass (iree-global-opt-detach-elementwise-from-named-ops) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
}
// -----// IR Dump After LinalgNamedOpConversionPass (linalg-named-op-conversion) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
}
// -----// IR Dump After Convert1X1FilterConv2DToMatmulPass (iree-global-opt-convert-1x1-filter-conv2d-to-matmul) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
}
// -----// IR Dump After EraseUnusedLinalgOperandsPass (iree-global-opt-erase-unused-linalg-operands) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}>
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_hip
  util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
    %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
    %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
    %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
    %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
    %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
    %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
    %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
}
// -----// IR Dump After ExpandTensorShapesPass (iree-global-opt-expand-tensor-shapes) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}>
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_hip
  util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
    %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
    %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
    %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
    %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
    %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
    %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
    %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
}
// -----// IR Dump After ConvertElementwiseToLinalgPass (convert-elementwise-to-linalg) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
}
// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
}
// -----// IR Dump After DecomposeConcatPass (iree-global-opt-decompose-concat) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
}
// -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
}
// -----// IR Dump After FoldUnitExtentDimsPass (iree-dispatch-creation-fold-unit-extent-dims) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}>
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_hip
  util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
    %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
    %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
    %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
    %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
    %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
    %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
    %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
}
// -----// IR Dump After DemoteContractionInputsToBF16Pass (iree-global-opt-demote-contraction-inputs-to-bf16) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
}
// -----// IR Dump After SetEncodingPass (iree-dispatch-creation-set-encoding) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %c1 = arith.constant 1 : index
  %c0 = arith.constant 0 : index
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = iree_encoding.set_encoding %2 : tensor<?x?xi8> -> tensor<?x?xi8, #iree_encoding.encoding<operand_index = 0 : index, op_type = matmul, element_types = [i8, i8, i32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array<i64: 32, 32, 32>>>
  %10 = iree_encoding.set_encoding %5 : tensor<?x?xi8> -> tensor<?x?xi8, #iree_encoding.encoding<operand_index = 1 : index, op_type = matmul, element_types = [i8, i8, i32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array<i64: 32, 32, 32>>>
  %11 = iree_encoding.set_encoding %8 : tensor<?x?xi32> -> tensor<?x?xi32, #iree_encoding.encoding<operand_index = 2 : index, op_type = matmul, element_types = [i8, i8, i32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array<i64: 32, 32, 32>>>
  %12 = linalg.matmul ins(%9, %10 : tensor<?x?xi8, #iree_encoding.encoding<operand_index = 0 : index, op_type = matmul, element_types = [i8, i8, i32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array<i64: 32, 32, 32>>>, tensor<?x?xi8, #iree_encoding.encoding<operand_index = 1 : index, op_type = matmul, element_types = [i8, i8, i32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array<i64: 32, 32, 32>>>) outs(%11 : tensor<?x?xi32, #iree_encoding.encoding<operand_index = 2 : index, op_type = matmul, element_types = [i8, i8, i32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array<i64: 32, 32, 32>>>) -> tensor<?x?xi32, #iree_encoding.encoding<operand_index = 2 : index, op_type = matmul, element_types = [i8, i8, i32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array<i64: 32, 32, 32>>>
  %dim = tensor.dim %8, %c0 : tensor<?x?xi32>
  %dim_0 = tensor.dim %8, %c1 : tensor<?x?xi32>
  %13 = iree_encoding.unset_encoding %12 : tensor<?x?xi32, #iree_encoding.encoding<operand_index = 2 : index, op_type = matmul, element_types = [i8, i8, i32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array<i64: 32, 32, 32>>> -> tensor<?x?xi32>
  %extracted_slice = tensor.extract_slice %13[0, 0] [%dim, %dim_0] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32>
  %14 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %14 : !hal.buffer_view
}
// -----// IR Dump After GPUMaterializeHostEncodingPass (iree-codegen-gpu-materialize-host-encoding) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 128)> | |
#map1 = affine_map<()[s0] -> (s0 ceildiv 64)> | |
#map2 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
#map3 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
#map4 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%dim = tensor.dim %2, %c0 : tensor<?x?xi8> | |
%dim_0 = tensor.dim %2, %c1 : tensor<?x?xi8> | |
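    // LHS (M x K): the M tile of 128 is unroll_m (8) times the intrinsic's 16 rows, and | |
    // the K tile of 64 is unroll_k (2) times the intrinsic's 32 i8 elements. | |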
%9 = affine.apply #map()[%dim] | |
%10 = affine.apply #map1()[%dim_0] | |
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8> | |
%pack = tensor.pack %2 padding_value(%c0_i8 : i8) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [128, 64] into %11 : tensor<?x?xi8> -> tensor<?x?x128x64xi8> | |
%expanded = tensor.expand_shape %pack [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8> | |
%12 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%transposed = linalg.transpose ins(%expanded : tensor<?x?x8x16x2x4x8xi8>) outs(%12 : tensor<?x?x8x4x16x2x8xi8>) permutation = [0, 1, 2, 5, 3, 4, 6] | |
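    // RHS (K x N): packed N-major (outer_dims_perm = [1, 0]) into 128x64 tiles; the N | |
    // tile of 128 is unroll_n_to_subgroups (4) x unroll_n (2) x the intrinsic's 16 | |
    // columns, and the K tile again matches unroll_k (2) x 32. | |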
%dim_1 = tensor.dim %5, %c0 : tensor<?x?xi8> | |
%dim_2 = tensor.dim %5, %c1 : tensor<?x?xi8> | |
%13 = affine.apply #map()[%dim_2] | |
%14 = affine.apply #map1()[%dim_1] | |
%15 = tensor.empty(%13, %14) : tensor<?x?x128x64xi8> | |
%pack_3 = tensor.pack %5 padding_value(%c0_i8 : i8) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [128, 64] into %15 : tensor<?x?xi8> -> tensor<?x?x128x64xi8> | |
%expanded_4 = tensor.expand_shape %pack_3 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%13, %14, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8> | |
%16 = tensor.empty(%13, %14) : tensor<?x?x4x2x4x16x2x8xi8> | |
%transposed_5 = linalg.transpose ins(%expanded_4 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%16 : tensor<?x?x4x2x4x16x2x8xi8>) permutation = [0, 1, 2, 3, 6, 4, 5, 7] | |
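    // Accumulator (M x N): packed into 128x128 tiles and transposed into the i32 | |
    // accumulator register layout (presumably 4 values per lane of the 64-wide | |
    // subgroup for each 16x16 intrinsic tile). | |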
%dim_6 = tensor.dim %8, %c0 : tensor<?x?xi32> | |
%dim_7 = tensor.dim %8, %c1 : tensor<?x?xi32> | |
%17 = affine.apply #map()[%dim_6] | |
%18 = affine.apply #map()[%dim_7] | |
%19 = tensor.empty(%17, %18) : tensor<?x?x128x128xi32> | |
%pack_8 = tensor.pack %8 padding_value(%c0_i32 : i32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [128, 128] into %19 : tensor<?x?xi32> -> tensor<?x?x128x128xi32> | |
%expanded_9 = tensor.expand_shape %pack_8 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%17, %18, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32> | |
%20 = tensor.empty(%17, %18) : tensor<?x?x8x4x2x4x16x4xi32> | |
%transposed_10 = linalg.transpose ins(%expanded_9 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%20 : tensor<?x?x8x4x2x4x16x4xi32>) permutation = [0, 1, 2, 5, 6, 3, 7, 4] | |
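    // With all three operands in the data-tiled layout, the matmul reduces to a | |
    // multi_mma over the two outer tile dimensions; the swizzled inner dimensions are | |
    // carried opaquely by the #iree_gpu.data_tiled_mma_layout kind. | |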
%21 = iree_gpu.multi_mma %transposed, %transposed_5, %transposed_10 {indexing_maps = [#map2, #map3, #map4], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%dim_11 = tensor.dim %8, %c0 : tensor<?x?xi32> | |
%dim_12 = tensor.dim %8, %c1 : tensor<?x?xi32> | |
%dim_13 = tensor.dim %21, %c0 : tensor<?x?x8x4x2x4x16x4xi32> | |
%dim_14 = tensor.dim %21, %c1 : tensor<?x?x8x4x2x4x16x4xi32> | |
%22 = tensor.empty(%dim_13, %dim_14) : tensor<?x?x8x4x4x4x2x16xi32> | |
%transposed_15 = linalg.transpose ins(%21 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%22 : tensor<?x?x8x4x4x4x2x16xi32>) permutation = [0, 1, 2, 5, 7, 3, 4, 6] | |
%collapsed = tensor.collapse_shape %transposed_15 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32> | |
%23 = tensor.empty(%dim_11, %dim_12) : tensor<?x?xi32> | |
%unpack = tensor.unpack %collapsed outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [128, 128] into %23 : tensor<?x?x128x128xi32> -> tensor<?x?xi32> | |
%24 = hal.tensor.export %unpack "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %24 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After DecomposePackUnPackOpsPass (iree-codegen-decompose-pack-unpack-ops) //----- // | |
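// tensor.pack decomposes into tensor.pad + tensor.expand_shape + linalg.transpose, and | |
// tensor.unpack into linalg.transpose + tensor.collapse_shape + tensor.extract_slice | |
// plus a final linalg.copy into the exact-size destination. | |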
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%dim = tensor.dim %2, %c0 : tensor<?x?xi8> | |
%dim_0 = tensor.dim %2, %c1 : tensor<?x?xi8> | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%dim] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%dim_0] | |
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8> | |
%dim_1 = tensor.dim %2, %c0 : tensor<?x?xi8> | |
%12 = affine.apply affine_map<()[s0, s1] -> (-s0 + (s1 ceildiv 128) * 128)>()[%dim_1, %dim] | |
%dim_2 = tensor.dim %2, %c1 : tensor<?x?xi8> | |
%13 = affine.apply affine_map<()[s0, s1] -> (-s0 + (s1 ceildiv 64) * 64)>()[%dim_2, %dim_0] | |
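  // Pad M up to the next multiple of 128 and K up to the next multiple of 64, filling | |
  // with the zero i8 value, which is neutral for the integer matmul. | |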
%padded = tensor.pad %2 low[0, 0] high[%12, %13] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i8 : i8 | |
} : tensor<?x?xi8> to tensor<?x?xi8> | |
%dim_3 = tensor.dim %2, %c0 : tensor<?x?xi8> | |
%14 = affine.apply affine_map<()[s0, s1, s2] -> (-s0 + s1 + (s2 ceildiv 128) * 128)>()[%dim_1, %dim_3, %dim] | |
%dim_4 = tensor.dim %2, %c1 : tensor<?x?xi8> | |
%15 = affine.apply affine_map<()[s0, s1, s2] -> (-s0 + s1 + (s2 ceildiv 64) * 64)>()[%dim_2, %dim_4, %dim_0] | |
%16 = arith.divui %14, %c128 : index | |
%17 = arith.divui %15, %c64 : index | |
%expanded = tensor.expand_shape %padded [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8> | |
%transposed = linalg.transpose ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) permutation = [0, 2, 1, 3] | |
%expanded_5 = tensor.expand_shape %transposed [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8> | |
%18 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%transposed_6 = linalg.transpose ins(%expanded_5 : tensor<?x?x8x16x2x4x8xi8>) outs(%18 : tensor<?x?x8x4x16x2x8xi8>) permutation = [0, 1, 2, 5, 3, 4, 6] | |
%dim_7 = tensor.dim %5, %c0 : tensor<?x?xi8> | |
%dim_8 = tensor.dim %5, %c1 : tensor<?x?xi8> | |
%19 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%dim_8] | |
%20 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%dim_7] | |
%21 = tensor.empty(%19, %20) : tensor<?x?x128x64xi8> | |
%dim_9 = tensor.dim %5, %c1 : tensor<?x?xi8> | |
%22 = affine.apply affine_map<()[s0, s1] -> (-s0 + (s1 ceildiv 128) * 128)>()[%dim_9, %dim_8] | |
%dim_10 = tensor.dim %5, %c0 : tensor<?x?xi8> | |
%23 = affine.apply affine_map<()[s0, s1] -> (-s0 + (s1 ceildiv 64) * 64)>()[%dim_10, %dim_7] | |
%padded_11 = tensor.pad %5 low[0, 0] high[%23, %22] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i8 : i8 | |
} : tensor<?x?xi8> to tensor<?x?xi8> | |
%dim_12 = tensor.dim %5, %c0 : tensor<?x?xi8> | |
%24 = affine.apply affine_map<()[s0, s1, s2] -> (-s0 + s1 + (s2 ceildiv 64) * 64)>()[%dim_10, %dim_12, %dim_7] | |
%dim_13 = tensor.dim %5, %c1 : tensor<?x?xi8> | |
%25 = affine.apply affine_map<()[s0, s1, s2] -> (-s0 + s1 + (s2 ceildiv 128) * 128)>()[%dim_9, %dim_13, %dim_8] | |
%26 = arith.divui %24, %c64 : index | |
%27 = arith.divui %25, %c128 : index | |
%expanded_14 = tensor.expand_shape %padded_11 [[0, 1], [2, 3]] output_shape [%26, 64, %27, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8> | |
%transposed_15 = linalg.transpose ins(%expanded_14 : tensor<?x64x?x128xi8>) outs(%21 : tensor<?x?x128x64xi8>) permutation = [2, 0, 3, 1] | |
%expanded_16 = tensor.expand_shape %transposed_15 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%19, %20, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8> | |
%28 = tensor.empty(%19, %20) : tensor<?x?x4x2x4x16x2x8xi8> | |
%transposed_17 = linalg.transpose ins(%expanded_16 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%28 : tensor<?x?x4x2x4x16x2x8xi8>) permutation = [0, 1, 2, 3, 6, 4, 5, 7] | |
%dim_18 = tensor.dim %8, %c0 : tensor<?x?xi32> | |
%dim_19 = tensor.dim %8, %c1 : tensor<?x?xi32> | |
%29 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%dim_18] | |
%30 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%dim_19] | |
%31 = tensor.empty(%29, %30) : tensor<?x?x128x128xi32> | |
%dim_20 = tensor.dim %8, %c0 : tensor<?x?xi32> | |
%32 = affine.apply affine_map<()[s0, s1] -> (-s0 + (s1 ceildiv 128) * 128)>()[%dim_20, %dim_18] | |
%dim_21 = tensor.dim %8, %c1 : tensor<?x?xi32> | |
%33 = affine.apply affine_map<()[s0, s1] -> (-s0 + (s1 ceildiv 128) * 128)>()[%dim_21, %dim_19] | |
%padded_22 = tensor.pad %8 low[0, 0] high[%32, %33] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i32 : i32 | |
} : tensor<?x?xi32> to tensor<?x?xi32> | |
%dim_23 = tensor.dim %8, %c0 : tensor<?x?xi32> | |
%34 = affine.apply affine_map<()[s0, s1, s2] -> (-s0 + s1 + (s2 ceildiv 128) * 128)>()[%dim_20, %dim_23, %dim_18] | |
%dim_24 = tensor.dim %8, %c1 : tensor<?x?xi32> | |
%35 = affine.apply affine_map<()[s0, s1, s2] -> (-s0 + s1 + (s2 ceildiv 128) * 128)>()[%dim_21, %dim_24, %dim_19] | |
%36 = arith.divui %34, %c128 : index | |
%37 = arith.divui %35, %c128 : index | |
%expanded_25 = tensor.expand_shape %padded_22 [[0, 1], [2, 3]] output_shape [%36, 128, %37, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32> | |
%transposed_26 = linalg.transpose ins(%expanded_25 : tensor<?x128x?x128xi32>) outs(%31 : tensor<?x?x128x128xi32>) permutation = [0, 2, 1, 3] | |
%expanded_27 = tensor.expand_shape %transposed_26 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%29, %30, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32> | |
%38 = tensor.empty(%29, %30) : tensor<?x?x8x4x2x4x16x4xi32> | |
%transposed_28 = linalg.transpose ins(%expanded_27 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%38 : tensor<?x?x8x4x2x4x16x4xi32>) permutation = [0, 1, 2, 5, 6, 3, 7, 4] | |
%39 = iree_gpu.multi_mma %transposed_6, %transposed_17, %transposed_28 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%dim_29 = tensor.dim %8, %c0 : tensor<?x?xi32> | |
%dim_30 = tensor.dim %8, %c1 : tensor<?x?xi32> | |
%dim_31 = tensor.dim %39, %c0 : tensor<?x?x8x4x2x4x16x4xi32> | |
%dim_32 = tensor.dim %39, %c1 : tensor<?x?x8x4x2x4x16x4xi32> | |
%40 = tensor.empty(%dim_31, %dim_32) : tensor<?x?x8x4x4x4x2x16xi32> | |
%transposed_33 = linalg.transpose ins(%39 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%40 : tensor<?x?x8x4x4x4x2x16xi32>) permutation = [0, 1, 2, 5, 7, 3, 4, 6] | |
%collapsed = tensor.collapse_shape %transposed_33 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32> | |
%41 = tensor.empty(%dim_29, %dim_30) : tensor<?x?xi32> | |
%dim_34 = tensor.dim %39, %c0 : tensor<?x?x8x4x2x4x16x4xi32> | |
%dim_35 = tensor.dim %39, %c1 : tensor<?x?x8x4x2x4x16x4xi32> | |
%42 = tensor.empty(%dim_34, %dim_35) : tensor<?x128x?x128xi32> | |
%transposed_36 = linalg.transpose ins(%collapsed : tensor<?x?x128x128xi32>) outs(%42 : tensor<?x128x?x128xi32>) permutation = [0, 2, 1, 3] | |
%collapsed_37 = tensor.collapse_shape %transposed_36 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed_37[0, 0] [%dim_29, %dim_30] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%43 = linalg.copy ins(%extracted_slice : tensor<?x?xi32>) outs(%41 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%44 = hal.tensor.export %43 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %44 : !hal.buffer_view | |
} | |
// -----// IR Dump After MaterializeHomogeneousEncodingsPass (iree-global-opt-materialize-homogeneous-encodings) //----- // | |
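// The encodings were already materialized by the GPU host-encoding pass above, so this | |
// pass leaves the IR unchanged; the dump is simply printed at module scope with named | |
// #map aliases instead of inline affine maps. | |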
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 128)> | |
#map1 = affine_map<()[s0] -> (s0 ceildiv 64)> | |
#map2 = affine_map<()[s0, s1] -> (-s0 + (s1 ceildiv 128) * 128)> | |
#map3 = affine_map<()[s0, s1] -> (-s0 + (s1 ceildiv 64) * 64)> | |
#map4 = affine_map<()[s0, s1, s2] -> (-s0 + s1 + (s2 ceildiv 128) * 128)> | |
#map5 = affine_map<()[s0, s1, s2] -> (-s0 + s1 + (s2 ceildiv 64) * 64)> | |
#map6 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
#map7 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
#map8 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%dim = tensor.dim %2, %c0 : tensor<?x?xi8> | |
%dim_0 = tensor.dim %2, %c1 : tensor<?x?xi8> | |
%9 = affine.apply #map()[%dim] | |
%10 = affine.apply #map1()[%dim_0] | |
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8> | |
%dim_1 = tensor.dim %2, %c0 : tensor<?x?xi8> | |
%12 = affine.apply #map2()[%dim_1, %dim] | |
%dim_2 = tensor.dim %2, %c1 : tensor<?x?xi8> | |
%13 = affine.apply #map3()[%dim_2, %dim_0] | |
%padded = tensor.pad %2 low[0, 0] high[%12, %13] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i8 : i8 | |
} : tensor<?x?xi8> to tensor<?x?xi8> | |
%dim_3 = tensor.dim %2, %c0 : tensor<?x?xi8> | |
%14 = affine.apply #map4()[%dim_1, %dim_3, %dim] | |
%dim_4 = tensor.dim %2, %c1 : tensor<?x?xi8> | |
%15 = affine.apply #map5()[%dim_2, %dim_4, %dim_0] | |
%16 = arith.divui %14, %c128 : index | |
%17 = arith.divui %15, %c64 : index | |
%expanded = tensor.expand_shape %padded [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8> | |
%transposed = linalg.transpose ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) permutation = [0, 2, 1, 3] | |
%expanded_5 = tensor.expand_shape %transposed [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8> | |
%18 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%transposed_6 = linalg.transpose ins(%expanded_5 : tensor<?x?x8x16x2x4x8xi8>) outs(%18 : tensor<?x?x8x4x16x2x8xi8>) permutation = [0, 1, 2, 5, 3, 4, 6] | |
%dim_7 = tensor.dim %5, %c0 : tensor<?x?xi8> | |
%dim_8 = tensor.dim %5, %c1 : tensor<?x?xi8> | |
%19 = affine.apply #map()[%dim_8] | |
%20 = affine.apply #map1()[%dim_7] | |
%21 = tensor.empty(%19, %20) : tensor<?x?x128x64xi8> | |
%dim_9 = tensor.dim %5, %c1 : tensor<?x?xi8> | |
%22 = affine.apply #map2()[%dim_9, %dim_8] | |
%dim_10 = tensor.dim %5, %c0 : tensor<?x?xi8> | |
%23 = affine.apply #map3()[%dim_10, %dim_7] | |
%padded_11 = tensor.pad %5 low[0, 0] high[%23, %22] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i8 : i8 | |
} : tensor<?x?xi8> to tensor<?x?xi8> | |
%dim_12 = tensor.dim %5, %c0 : tensor<?x?xi8> | |
%24 = affine.apply #map5()[%dim_10, %dim_12, %dim_7] | |
%dim_13 = tensor.dim %5, %c1 : tensor<?x?xi8> | |
%25 = affine.apply #map4()[%dim_9, %dim_13, %dim_8] | |
%26 = arith.divui %24, %c64 : index | |
%27 = arith.divui %25, %c128 : index | |
%expanded_14 = tensor.expand_shape %padded_11 [[0, 1], [2, 3]] output_shape [%26, 64, %27, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8> | |
%transposed_15 = linalg.transpose ins(%expanded_14 : tensor<?x64x?x128xi8>) outs(%21 : tensor<?x?x128x64xi8>) permutation = [2, 0, 3, 1] | |
%expanded_16 = tensor.expand_shape %transposed_15 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%19, %20, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8> | |
%28 = tensor.empty(%19, %20) : tensor<?x?x4x2x4x16x2x8xi8> | |
%transposed_17 = linalg.transpose ins(%expanded_16 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%28 : tensor<?x?x4x2x4x16x2x8xi8>) permutation = [0, 1, 2, 3, 6, 4, 5, 7] | |
%dim_18 = tensor.dim %8, %c0 : tensor<?x?xi32> | |
%dim_19 = tensor.dim %8, %c1 : tensor<?x?xi32> | |
%29 = affine.apply #map()[%dim_18] | |
%30 = affine.apply #map()[%dim_19] | |
%31 = tensor.empty(%29, %30) : tensor<?x?x128x128xi32> | |
%dim_20 = tensor.dim %8, %c0 : tensor<?x?xi32> | |
%32 = affine.apply #map2()[%dim_20, %dim_18] | |
%dim_21 = tensor.dim %8, %c1 : tensor<?x?xi32> | |
%33 = affine.apply #map2()[%dim_21, %dim_19] | |
%padded_22 = tensor.pad %8 low[0, 0] high[%32, %33] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i32 : i32 | |
} : tensor<?x?xi32> to tensor<?x?xi32> | |
%dim_23 = tensor.dim %8, %c0 : tensor<?x?xi32> | |
%34 = affine.apply #map4()[%dim_20, %dim_23, %dim_18] | |
%dim_24 = tensor.dim %8, %c1 : tensor<?x?xi32> | |
%35 = affine.apply #map4()[%dim_21, %dim_24, %dim_19] | |
%36 = arith.divui %34, %c128 : index | |
%37 = arith.divui %35, %c128 : index | |
%expanded_25 = tensor.expand_shape %padded_22 [[0, 1], [2, 3]] output_shape [%36, 128, %37, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32> | |
%transposed_26 = linalg.transpose ins(%expanded_25 : tensor<?x128x?x128xi32>) outs(%31 : tensor<?x?x128x128xi32>) permutation = [0, 2, 1, 3] | |
%expanded_27 = tensor.expand_shape %transposed_26 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%29, %30, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32> | |
%38 = tensor.empty(%29, %30) : tensor<?x?x8x4x2x4x16x4xi32> | |
%transposed_28 = linalg.transpose ins(%expanded_27 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%38 : tensor<?x?x8x4x2x4x16x4xi32>) permutation = [0, 1, 2, 5, 6, 3, 7, 4] | |
%39 = iree_gpu.multi_mma %transposed_6, %transposed_17, %transposed_28 {indexing_maps = [#map6, #map7, #map8], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%dim_29 = tensor.dim %8, %c0 : tensor<?x?xi32> | |
%dim_30 = tensor.dim %8, %c1 : tensor<?x?xi32> | |
%dim_31 = tensor.dim %39, %c0 : tensor<?x?x8x4x2x4x16x4xi32> | |
%dim_32 = tensor.dim %39, %c1 : tensor<?x?x8x4x2x4x16x4xi32> | |
%40 = tensor.empty(%dim_31, %dim_32) : tensor<?x?x8x4x4x4x2x16xi32> | |
%transposed_33 = linalg.transpose ins(%39 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%40 : tensor<?x?x8x4x4x4x2x16xi32>) permutation = [0, 1, 2, 5, 7, 3, 4, 6] | |
%collapsed = tensor.collapse_shape %transposed_33 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32> | |
%41 = tensor.empty(%dim_29, %dim_30) : tensor<?x?xi32> | |
%dim_34 = tensor.dim %39, %c0 : tensor<?x?x8x4x2x4x16x4xi32> | |
%dim_35 = tensor.dim %39, %c1 : tensor<?x?x8x4x2x4x16x4xi32> | |
%42 = tensor.empty(%dim_34, %dim_35) : tensor<?x128x?x128xi32> | |
%transposed_36 = linalg.transpose ins(%collapsed : tensor<?x?x128x128xi32>) outs(%42 : tensor<?x128x?x128xi32>) permutation = [0, 2, 1, 3] | |
%collapsed_37 = tensor.collapse_shape %transposed_36 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed_37[0, 0] [%dim_29, %dim_30] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%43 = linalg.copy ins(%extracted_slice : tensor<?x?xi32>) outs(%41 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%44 = hal.tensor.export %43 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %44 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
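// Canonicalization folds the tensor.dim chains back to the imported dimension values | |
// (%0, %1, %3, %4, %6, %7), drops the now-unused %c0/%c1 index constants, and | |
// simplifies the padding/size maps to single-symbol forms such as | |
// affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)>. | |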
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 128)> | |
#map1 = affine_map<()[s0] -> (s0 ceildiv 64)> | |
#map2 = affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)> | |
#map3 = affine_map<()[s0] -> (-s0 + (s0 ceildiv 64) * 64)> | |
#map4 = affine_map<()[s0] -> ((s0 ceildiv 128) * 128)> | |
#map5 = affine_map<()[s0] -> ((s0 ceildiv 64) * 64)> | |
#map6 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
#map7 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
#map8 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply #map()[%0] | |
%10 = affine.apply #map1()[%1] | |
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8> | |
%12 = affine.apply #map2()[%0] | |
%13 = affine.apply #map3()[%1] | |
%padded = tensor.pad %2 low[0, 0] high[%12, %13] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i8 : i8 | |
} : tensor<?x?xi8> to tensor<?x?xi8> | |
%14 = affine.apply #map4()[%0] | |
%15 = affine.apply #map5()[%1] | |
%16 = arith.divui %14, %c128 : index | |
%17 = arith.divui %15, %c64 : index | |
%expanded = tensor.expand_shape %padded [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8> | |
%transposed = linalg.transpose ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) permutation = [0, 2, 1, 3] | |
%expanded_0 = tensor.expand_shape %transposed [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8> | |
%18 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%transposed_1 = linalg.transpose ins(%expanded_0 : tensor<?x?x8x16x2x4x8xi8>) outs(%18 : tensor<?x?x8x4x16x2x8xi8>) permutation = [0, 1, 2, 5, 3, 4, 6] | |
%19 = affine.apply #map()[%4] | |
%20 = affine.apply #map1()[%3] | |
%21 = tensor.empty(%19, %20) : tensor<?x?x128x64xi8> | |
%22 = affine.apply #map2()[%4] | |
%23 = affine.apply #map3()[%3] | |
%padded_2 = tensor.pad %5 low[0, 0] high[%23, %22] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i8 : i8 | |
} : tensor<?x?xi8> to tensor<?x?xi8> | |
%24 = affine.apply #map5()[%3] | |
%25 = affine.apply #map4()[%4] | |
%26 = arith.divui %24, %c64 : index | |
%27 = arith.divui %25, %c128 : index | |
%expanded_3 = tensor.expand_shape %padded_2 [[0, 1], [2, 3]] output_shape [%26, 64, %27, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8> | |
%transposed_4 = linalg.transpose ins(%expanded_3 : tensor<?x64x?x128xi8>) outs(%21 : tensor<?x?x128x64xi8>) permutation = [2, 0, 3, 1] | |
%expanded_5 = tensor.expand_shape %transposed_4 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%19, %20, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8> | |
%28 = tensor.empty(%19, %20) : tensor<?x?x4x2x4x16x2x8xi8> | |
%transposed_6 = linalg.transpose ins(%expanded_5 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%28 : tensor<?x?x4x2x4x16x2x8xi8>) permutation = [0, 1, 2, 3, 6, 4, 5, 7] | |
%29 = affine.apply #map()[%6] | |
%30 = affine.apply #map()[%7] | |
%31 = tensor.empty(%29, %30) : tensor<?x?x128x128xi32> | |
%32 = affine.apply #map2()[%6] | |
%33 = affine.apply #map2()[%7] | |
%padded_7 = tensor.pad %8 low[0, 0] high[%32, %33] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i32 : i32 | |
} : tensor<?x?xi32> to tensor<?x?xi32> | |
%34 = affine.apply #map4()[%6] | |
%35 = affine.apply #map4()[%7] | |
%36 = arith.divui %34, %c128 : index | |
%37 = arith.divui %35, %c128 : index | |
%expanded_8 = tensor.expand_shape %padded_7 [[0, 1], [2, 3]] output_shape [%36, 128, %37, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32> | |
%transposed_9 = linalg.transpose ins(%expanded_8 : tensor<?x128x?x128xi32>) outs(%31 : tensor<?x?x128x128xi32>) permutation = [0, 2, 1, 3] | |
%expanded_10 = tensor.expand_shape %transposed_9 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%29, %30, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32> | |
%38 = tensor.empty(%29, %30) : tensor<?x?x8x4x2x4x16x4xi32> | |
%transposed_11 = linalg.transpose ins(%expanded_10 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%38 : tensor<?x?x8x4x2x4x16x4xi32>) permutation = [0, 1, 2, 5, 6, 3, 7, 4] | |
%39 = iree_gpu.multi_mma %transposed_1, %transposed_6, %transposed_11 {indexing_maps = [#map6, #map7, #map8], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%40 = tensor.empty(%29, %30) : tensor<?x?x8x4x4x4x2x16xi32> | |
%transposed_12 = linalg.transpose ins(%39 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%40 : tensor<?x?x8x4x4x4x2x16xi32>) permutation = [0, 1, 2, 5, 7, 3, 4, 6] | |
%collapsed = tensor.collapse_shape %transposed_12 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32> | |
%41 = tensor.empty(%6, %7) : tensor<?x?xi32> | |
%42 = tensor.empty(%29, %30) : tensor<?x128x?x128xi32> | |
%transposed_13 = linalg.transpose ins(%collapsed : tensor<?x?x128x128xi32>) outs(%42 : tensor<?x128x?x128xi32>) permutation = [0, 2, 1, 3] | |
%collapsed_14 = tensor.collapse_shape %transposed_13 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed_14[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%43 = linalg.copy ins(%extracted_slice : tensor<?x?xi32>) outs(%41 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%44 = hal.tensor.export %43 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %44 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
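// CSE makes no further changes here: after canonicalization each repeated affine.apply | |
// already has a single SSA value, so this dump is identical to the previous one. | |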
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 128)> | |
#map1 = affine_map<()[s0] -> (s0 ceildiv 64)> | |
#map2 = affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)> | |
#map3 = affine_map<()[s0] -> (-s0 + (s0 ceildiv 64) * 64)> | |
#map4 = affine_map<()[s0] -> ((s0 ceildiv 128) * 128)> | |
#map5 = affine_map<()[s0] -> ((s0 ceildiv 64) * 64)> | |
#map6 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
#map7 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
#map8 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply #map()[%0] | |
%10 = affine.apply #map1()[%1] | |
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8> | |
%12 = affine.apply #map2()[%0] | |
%13 = affine.apply #map3()[%1] | |
%padded = tensor.pad %2 low[0, 0] high[%12, %13] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i8 : i8 | |
} : tensor<?x?xi8> to tensor<?x?xi8> | |
%14 = affine.apply #map4()[%0] | |
%15 = affine.apply #map5()[%1] | |
%16 = arith.divui %14, %c128 : index | |
%17 = arith.divui %15, %c64 : index | |
%expanded = tensor.expand_shape %padded [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8> | |
%transposed = linalg.transpose ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) permutation = [0, 2, 1, 3] | |
%expanded_0 = tensor.expand_shape %transposed [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8> | |
%18 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%transposed_1 = linalg.transpose ins(%expanded_0 : tensor<?x?x8x16x2x4x8xi8>) outs(%18 : tensor<?x?x8x4x16x2x8xi8>) permutation = [0, 1, 2, 5, 3, 4, 6] | |
%19 = affine.apply #map()[%4] | |
%20 = affine.apply #map1()[%3] | |
%21 = tensor.empty(%19, %20) : tensor<?x?x128x64xi8> | |
%22 = affine.apply #map2()[%4] | |
%23 = affine.apply #map3()[%3] | |
%padded_2 = tensor.pad %5 low[0, 0] high[%23, %22] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i8 : i8 | |
} : tensor<?x?xi8> to tensor<?x?xi8> | |
%24 = affine.apply #map5()[%3] | |
%25 = affine.apply #map4()[%4] | |
%26 = arith.divui %24, %c64 : index | |
%27 = arith.divui %25, %c128 : index | |
%expanded_3 = tensor.expand_shape %padded_2 [[0, 1], [2, 3]] output_shape [%26, 64, %27, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8> | |
%transposed_4 = linalg.transpose ins(%expanded_3 : tensor<?x64x?x128xi8>) outs(%21 : tensor<?x?x128x64xi8>) permutation = [2, 0, 3, 1] | |
%expanded_5 = tensor.expand_shape %transposed_4 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%19, %20, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8> | |
%28 = tensor.empty(%19, %20) : tensor<?x?x4x2x4x16x2x8xi8> | |
%transposed_6 = linalg.transpose ins(%expanded_5 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%28 : tensor<?x?x4x2x4x16x2x8xi8>) permutation = [0, 1, 2, 3, 6, 4, 5, 7] | |
%29 = affine.apply #map()[%6] | |
%30 = affine.apply #map()[%7] | |
%31 = tensor.empty(%29, %30) : tensor<?x?x128x128xi32> | |
%32 = affine.apply #map2()[%6] | |
%33 = affine.apply #map2()[%7] | |
%padded_7 = tensor.pad %8 low[0, 0] high[%32, %33] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i32 : i32 | |
} : tensor<?x?xi32> to tensor<?x?xi32> | |
%34 = affine.apply #map4()[%6] | |
%35 = affine.apply #map4()[%7] | |
%36 = arith.divui %34, %c128 : index | |
%37 = arith.divui %35, %c128 : index | |
%expanded_8 = tensor.expand_shape %padded_7 [[0, 1], [2, 3]] output_shape [%36, 128, %37, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32> | |
%transposed_9 = linalg.transpose ins(%expanded_8 : tensor<?x128x?x128xi32>) outs(%31 : tensor<?x?x128x128xi32>) permutation = [0, 2, 1, 3] | |
%expanded_10 = tensor.expand_shape %transposed_9 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%29, %30, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32> | |
%38 = tensor.empty(%29, %30) : tensor<?x?x8x4x2x4x16x4xi32> | |
%transposed_11 = linalg.transpose ins(%expanded_10 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%38 : tensor<?x?x8x4x2x4x16x4xi32>) permutation = [0, 1, 2, 5, 6, 3, 7, 4] | |
%39 = iree_gpu.multi_mma %transposed_1, %transposed_6, %transposed_11 {indexing_maps = [#map6, #map7, #map8], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%40 = tensor.empty(%29, %30) : tensor<?x?x8x4x4x4x2x16xi32> | |
%transposed_12 = linalg.transpose ins(%39 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%40 : tensor<?x?x8x4x4x4x2x16xi32>) permutation = [0, 1, 2, 5, 7, 3, 4, 6] | |
%collapsed = tensor.collapse_shape %transposed_12 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32> | |
%41 = tensor.empty(%6, %7) : tensor<?x?xi32> | |
%42 = tensor.empty(%29, %30) : tensor<?x128x?x128xi32> | |
%transposed_13 = linalg.transpose ins(%collapsed : tensor<?x?x128x128xi32>) outs(%42 : tensor<?x128x?x128xi32>) permutation = [0, 2, 1, 3] | |
%collapsed_14 = tensor.collapse_shape %transposed_13 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed_14[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%43 = linalg.copy ins(%extracted_slice : tensor<?x?xi32>) outs(%41 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%44 = hal.tensor.export %43 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %44 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After SimplifyPackUnpackPass (iree-global-opt-simplify-pack-unpack) //----- // | |
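// No tensor.pack/unpack ops remain (they were decomposed earlier), so there is nothing | |
// for this pass to simplify and the IR is unchanged. | |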
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 128)> | |
#map1 = affine_map<()[s0] -> (s0 ceildiv 64)> | |
#map2 = affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)> | |
#map3 = affine_map<()[s0] -> (-s0 + (s0 ceildiv 64) * 64)> | |
#map4 = affine_map<()[s0] -> ((s0 ceildiv 128) * 128)> | |
#map5 = affine_map<()[s0] -> ((s0 ceildiv 64) * 64)> | |
#map6 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
#map7 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
#map8 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply #map()[%0] | |
%10 = affine.apply #map1()[%1] | |
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8> | |
%12 = affine.apply #map2()[%0] | |
%13 = affine.apply #map3()[%1] | |
%padded = tensor.pad %2 low[0, 0] high[%12, %13] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i8 : i8 | |
} : tensor<?x?xi8> to tensor<?x?xi8> | |
%14 = affine.apply #map4()[%0] | |
%15 = affine.apply #map5()[%1] | |
%16 = arith.divui %14, %c128 : index | |
%17 = arith.divui %15, %c64 : index | |
%expanded = tensor.expand_shape %padded [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8> | |
%transposed = linalg.transpose ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) permutation = [0, 2, 1, 3] | |
%expanded_0 = tensor.expand_shape %transposed [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8> | |
%18 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%transposed_1 = linalg.transpose ins(%expanded_0 : tensor<?x?x8x16x2x4x8xi8>) outs(%18 : tensor<?x?x8x4x16x2x8xi8>) permutation = [0, 1, 2, 5, 3, 4, 6] | |
%19 = affine.apply #map()[%4] | |
%20 = affine.apply #map1()[%3] | |
%21 = tensor.empty(%19, %20) : tensor<?x?x128x64xi8> | |
%22 = affine.apply #map2()[%4] | |
%23 = affine.apply #map3()[%3] | |
%padded_2 = tensor.pad %5 low[0, 0] high[%23, %22] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i8 : i8 | |
} : tensor<?x?xi8> to tensor<?x?xi8> | |
%24 = affine.apply #map5()[%3] | |
%25 = affine.apply #map4()[%4] | |
%26 = arith.divui %24, %c64 : index | |
%27 = arith.divui %25, %c128 : index | |
%expanded_3 = tensor.expand_shape %padded_2 [[0, 1], [2, 3]] output_shape [%26, 64, %27, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8> | |
%transposed_4 = linalg.transpose ins(%expanded_3 : tensor<?x64x?x128xi8>) outs(%21 : tensor<?x?x128x64xi8>) permutation = [2, 0, 3, 1] | |
%expanded_5 = tensor.expand_shape %transposed_4 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%19, %20, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8> | |
%28 = tensor.empty(%19, %20) : tensor<?x?x4x2x4x16x2x8xi8> | |
%transposed_6 = linalg.transpose ins(%expanded_5 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%28 : tensor<?x?x4x2x4x16x2x8xi8>) permutation = [0, 1, 2, 3, 6, 4, 5, 7] | |
%29 = affine.apply #map()[%6] | |
%30 = affine.apply #map()[%7] | |
%31 = tensor.empty(%29, %30) : tensor<?x?x128x128xi32> | |
%32 = affine.apply #map2()[%6] | |
%33 = affine.apply #map2()[%7] | |
%padded_7 = tensor.pad %8 low[0, 0] high[%32, %33] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i32 : i32 | |
} : tensor<?x?xi32> to tensor<?x?xi32> | |
%34 = affine.apply #map4()[%6] | |
%35 = affine.apply #map4()[%7] | |
%36 = arith.divui %34, %c128 : index | |
%37 = arith.divui %35, %c128 : index | |
%expanded_8 = tensor.expand_shape %padded_7 [[0, 1], [2, 3]] output_shape [%36, 128, %37, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32> | |
%transposed_9 = linalg.transpose ins(%expanded_8 : tensor<?x128x?x128xi32>) outs(%31 : tensor<?x?x128x128xi32>) permutation = [0, 2, 1, 3] | |
%expanded_10 = tensor.expand_shape %transposed_9 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%29, %30, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32> | |
%38 = tensor.empty(%29, %30) : tensor<?x?x8x4x2x4x16x4xi32> | |
%transposed_11 = linalg.transpose ins(%expanded_10 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%38 : tensor<?x?x8x4x2x4x16x4xi32>) permutation = [0, 1, 2, 5, 6, 3, 7, 4] | |
%39 = iree_gpu.multi_mma %transposed_1, %transposed_6, %transposed_11 {indexing_maps = [#map6, #map7, #map8], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%40 = tensor.empty(%29, %30) : tensor<?x?x8x4x4x4x2x16xi32> | |
%transposed_12 = linalg.transpose ins(%39 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%40 : tensor<?x?x8x4x4x4x2x16xi32>) permutation = [0, 1, 2, 5, 7, 3, 4, 6] | |
%collapsed = tensor.collapse_shape %transposed_12 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32> | |
%41 = tensor.empty(%6, %7) : tensor<?x?xi32> | |
%42 = tensor.empty(%29, %30) : tensor<?x128x?x128xi32> | |
%transposed_13 = linalg.transpose ins(%collapsed : tensor<?x?x128x128xi32>) outs(%42 : tensor<?x128x?x128xi32>) permutation = [0, 2, 1, 3] | |
%collapsed_14 = tensor.collapse_shape %transposed_13 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed_14[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%43 = linalg.copy ins(%extracted_slice : tensor<?x?xi32>) outs(%41 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%44 = hal.tensor.export %43 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %44 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After DataLayoutPropagationPass (iree-global-opt-data-layout-propagation) //----- // | |
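// Data-layout propagation also finds nothing to do; the dump is printed at function | |
// scope, which is why the affine maps now appear inline rather than as #map aliases. | |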
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8> | |
%12 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)>()[%0] | |
%13 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 64) * 64)>()[%1] | |
%padded = tensor.pad %2 low[0, 0] high[%12, %13] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i8 : i8 | |
} : tensor<?x?xi8> to tensor<?x?xi8> | |
%14 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%15 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%16 = arith.divui %14, %c128 : index | |
%17 = arith.divui %15, %c64 : index | |
%expanded = tensor.expand_shape %padded [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8> | |
%transposed = linalg.transpose ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) permutation = [0, 2, 1, 3] | |
%expanded_0 = tensor.expand_shape %transposed [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8> | |
%18 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%transposed_1 = linalg.transpose ins(%expanded_0 : tensor<?x?x8x16x2x4x8xi8>) outs(%18 : tensor<?x?x8x4x16x2x8xi8>) permutation = [0, 1, 2, 5, 3, 4, 6] | |
%19 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%20 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%21 = tensor.empty(%19, %20) : tensor<?x?x128x64xi8> | |
%22 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)>()[%4] | |
%23 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 64) * 64)>()[%3] | |
%padded_2 = tensor.pad %5 low[0, 0] high[%23, %22] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i8 : i8 | |
} : tensor<?x?xi8> to tensor<?x?xi8> | |
%24 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%25 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%26 = arith.divui %24, %c64 : index | |
%27 = arith.divui %25, %c128 : index | |
%expanded_3 = tensor.expand_shape %padded_2 [[0, 1], [2, 3]] output_shape [%26, 64, %27, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8> | |
%transposed_4 = linalg.transpose ins(%expanded_3 : tensor<?x64x?x128xi8>) outs(%21 : tensor<?x?x128x64xi8>) permutation = [2, 0, 3, 1] | |
%expanded_5 = tensor.expand_shape %transposed_4 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%19, %20, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8> | |
%28 = tensor.empty(%19, %20) : tensor<?x?x4x2x4x16x2x8xi8> | |
%transposed_6 = linalg.transpose ins(%expanded_5 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%28 : tensor<?x?x4x2x4x16x2x8xi8>) permutation = [0, 1, 2, 3, 6, 4, 5, 7] | |
%29 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%30 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%31 = tensor.empty(%29, %30) : tensor<?x?x128x128xi32> | |
%32 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)>()[%6] | |
%33 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)>()[%7] | |
%padded_7 = tensor.pad %8 low[0, 0] high[%32, %33] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i32 : i32 | |
} : tensor<?x?xi32> to tensor<?x?xi32> | |
%34 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%35 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%36 = arith.divui %34, %c128 : index | |
%37 = arith.divui %35, %c128 : index | |
%expanded_8 = tensor.expand_shape %padded_7 [[0, 1], [2, 3]] output_shape [%36, 128, %37, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32> | |
%transposed_9 = linalg.transpose ins(%expanded_8 : tensor<?x128x?x128xi32>) outs(%31 : tensor<?x?x128x128xi32>) permutation = [0, 2, 1, 3] | |
%expanded_10 = tensor.expand_shape %transposed_9 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%29, %30, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32> | |
%38 = tensor.empty(%29, %30) : tensor<?x?x8x4x2x4x16x4xi32> | |
%transposed_11 = linalg.transpose ins(%expanded_10 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%38 : tensor<?x?x8x4x2x4x16x4xi32>) permutation = [0, 1, 2, 5, 6, 3, 7, 4] | |
%39 = iree_gpu.multi_mma %transposed_1, %transposed_6, %transposed_11 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%40 = tensor.empty(%29, %30) : tensor<?x?x8x4x4x4x2x16xi32> | |
%transposed_12 = linalg.transpose ins(%39 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%40 : tensor<?x?x8x4x4x4x2x16xi32>) permutation = [0, 1, 2, 5, 7, 3, 4, 6] | |
%collapsed = tensor.collapse_shape %transposed_12 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32> | |
%41 = tensor.empty(%6, %7) : tensor<?x?xi32> | |
%42 = tensor.empty(%29, %30) : tensor<?x128x?x128xi32> | |
%transposed_13 = linalg.transpose ins(%collapsed : tensor<?x?x128x128xi32>) outs(%42 : tensor<?x128x?x128xi32>) permutation = [0, 2, 1, 3] | |
%collapsed_14 = tensor.collapse_shape %transposed_13 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed_14[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%43 = linalg.copy ins(%extracted_slice : tensor<?x?xi32>) outs(%41 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%44 = hal.tensor.export %43 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %44 : !hal.buffer_view | |
} | |
// -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- // | |
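// Each linalg.transpose is generalized into an equivalent linalg.generic whose input | |
// indexing map encodes the permutation, e.g. permutation [0, 2, 1, 3] becomes | |
// affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>. | |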
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8> | |
%12 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)>()[%0] | |
%13 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 64) * 64)>()[%1] | |
%padded = tensor.pad %2 low[0, 0] high[%12, %13] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i8 : i8 | |
} : tensor<?x?xi8> to tensor<?x?xi8> | |
%14 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%15 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%16 = arith.divui %14, %c128 : index | |
%17 = arith.divui %15, %c64 : index | |
%expanded = tensor.expand_shape %padded [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_0 = tensor.expand_shape %18 [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8> | |
%19 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d5, d3, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_0 : tensor<?x?x8x16x2x4x8xi8>) outs(%19 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
%21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%23 = tensor.empty(%21, %22) : tensor<?x?x128x64xi8> | |
%24 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)>()[%4] | |
%25 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 64) * 64)>()[%3] | |
%padded_1 = tensor.pad %5 low[0, 0] high[%25, %24] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i8 : i8 | |
} : tensor<?x?xi8> to tensor<?x?xi8> | |
%26 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%27 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%28 = arith.divui %26, %c64 : index | |
%29 = arith.divui %27, %c128 : index | |
%expanded_2 = tensor.expand_shape %padded_1 [[0, 1], [2, 3]] output_shape [%28, 64, %29, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8> | |
%30 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d3, d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2 : tensor<?x64x?x128xi8>) outs(%23 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_3 = tensor.expand_shape %30 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%21, %22, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8> | |
%31 = tensor.empty(%21, %22) : tensor<?x?x4x2x4x16x2x8xi8> | |
%32 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d5, d6, d4, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%31 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%33 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%34 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%35 = tensor.empty(%33, %34) : tensor<?x?x128x128xi32> | |
%36 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)>()[%6] | |
%37 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)>()[%7] | |
%padded_4 = tensor.pad %8 low[0, 0] high[%36, %37] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i32 : i32 | |
} : tensor<?x?xi32> to tensor<?x?xi32> | |
%38 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%39 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%40 = arith.divui %38, %c128 : index | |
%41 = arith.divui %39, %c128 : index | |
%expanded_5 = tensor.expand_shape %padded_4 [[0, 1], [2, 3]] output_shape [%40, 128, %41, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32> | |
%42 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_5 : tensor<?x128x?x128xi32>) outs(%35 : tensor<?x?x128x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x128x128xi32> | |
%expanded_6 = tensor.expand_shape %42 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%33, %34, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32> | |
%43 = tensor.empty(%33, %34) : tensor<?x?x8x4x2x4x16x4xi32> | |
%44 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d7, d3, d4, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_6 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%43 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%45 = iree_gpu.multi_mma %20, %32, %44 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%46 = tensor.empty(%33, %34) : tensor<?x?x8x4x4x4x2x16xi32> | |
%47 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%45 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%46 : tensor<?x?x8x4x4x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x4x4x2x16xi32> | |
%collapsed = tensor.collapse_shape %47 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32> | |
%48 = tensor.empty(%6, %7) : tensor<?x?xi32> | |
%49 = tensor.empty(%33, %34) : tensor<?x128x?x128xi32> | |
%50 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%collapsed : tensor<?x?x128x128xi32>) outs(%49 : tensor<?x128x?x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x128x?x128xi32> | |
%collapsed_7 = tensor.collapse_shape %50 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed_7[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%51 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice : tensor<?x?xi32>) outs(%48 : tensor<?x?xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?xi32> | |
%52 = hal.tensor.export %51 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %52 : !hal.buffer_view | |
} | |
// -----// IR Dump After GlobalLoopInvariantCodeMotionPass (iree-global-opt-loop-invariant-code-motion) //----- //
// (unchanged: function body identical to the preceding dump; duplicate elided)
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
%c64 = arith.constant 64 : index
%c128 = arith.constant 128 : index
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0]
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1]
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8>
%12 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)>()[%0]
%13 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 64) * 64)>()[%1]
%padded = tensor.pad %2 low[0, 0] high[%12, %13] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %c0_i8 : i8
} : tensor<?x?xi8> to tensor<?x?xi8>
%14 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0]
%15 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1]
%16 = arith.divui %14, %c128 : index
%17 = arith.divui %15, %c64 : index
%expanded = tensor.expand_shape %padded [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8>
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) {
^bb0(%in: i8, %out: i8):
linalg.yield %in : i8
} -> tensor<?x?x128x64xi8>
%expanded_0 = tensor.expand_shape %18 [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8>
%19 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8>
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d5, d3, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_0 : tensor<?x?x8x16x2x4x8xi8>) outs(%19 : tensor<?x?x8x4x16x2x8xi8>) {
^bb0(%in: i8, %out: i8):
linalg.yield %in : i8
} -> tensor<?x?x8x4x16x2x8xi8>
%21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4]
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3]
%23 = tensor.empty(%21, %22) : tensor<?x?x128x64xi8>
%24 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)>()[%4]
%25 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 64) * 64)>()[%3]
%padded_1 = tensor.pad %5 low[0, 0] high[%25, %24] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %c0_i8 : i8
} : tensor<?x?xi8> to tensor<?x?xi8>
%26 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3]
%27 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4]
%28 = arith.divui %26, %c64 : index
%29 = arith.divui %27, %c128 : index
%expanded_2 = tensor.expand_shape %padded_1 [[0, 1], [2, 3]] output_shape [%28, 64, %29, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8>
%30 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d3, d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2 : tensor<?x64x?x128xi8>) outs(%23 : tensor<?x?x128x64xi8>) {
^bb0(%in: i8, %out: i8):
linalg.yield %in : i8
} -> tensor<?x?x128x64xi8>
%expanded_3 = tensor.expand_shape %30 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%21, %22, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8>
%31 = tensor.empty(%21, %22) : tensor<?x?x4x2x4x16x2x8xi8>
%32 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d5, d6, d4, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%31 : tensor<?x?x4x2x4x16x2x8xi8>) {
^bb0(%in: i8, %out: i8):
linalg.yield %in : i8
} -> tensor<?x?x4x2x4x16x2x8xi8>
%33 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6]
%34 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7]
%35 = tensor.empty(%33, %34) : tensor<?x?x128x128xi32>
%36 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)>()[%6]
%37 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)>()[%7]
%padded_4 = tensor.pad %8 low[0, 0] high[%36, %37] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %c0_i32 : i32
} : tensor<?x?xi32> to tensor<?x?xi32>
%38 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6]
%39 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7]
%40 = arith.divui %38, %c128 : index
%41 = arith.divui %39, %c128 : index
%expanded_5 = tensor.expand_shape %padded_4 [[0, 1], [2, 3]] output_shape [%40, 128, %41, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32>
%42 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_5 : tensor<?x128x?x128xi32>) outs(%35 : tensor<?x?x128x128xi32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
} -> tensor<?x?x128x128xi32>
%expanded_6 = tensor.expand_shape %42 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%33, %34, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32>
%43 = tensor.empty(%33, %34) : tensor<?x?x8x4x2x4x16x4xi32>
%44 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d7, d3, d4, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_6 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%43 : tensor<?x?x8x4x2x4x16x4xi32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
} -> tensor<?x?x8x4x2x4x16x4xi32>
%45 = iree_gpu.multi_mma %20, %32, %44 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
%46 = tensor.empty(%33, %34) : tensor<?x?x8x4x4x4x2x16xi32>
%47 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%45 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%46 : tensor<?x?x8x4x4x4x2x16xi32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
} -> tensor<?x?x8x4x4x4x2x16xi32>
%collapsed = tensor.collapse_shape %47 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32>
%48 = tensor.empty(%33, %34) : tensor<?x128x?x128xi32>
%49 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%collapsed : tensor<?x?x128x128xi32>) outs(%48 : tensor<?x128x?x128xi32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
} -> tensor<?x128x?x128xi32>
%collapsed_7 = tensor.collapse_shape %49 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32>
%extracted_slice = tensor.extract_slice %collapsed_7[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32>
%50 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
util.return %50 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
// (unchanged: function body identical to the preceding dump; duplicate elided)
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
// (unchanged: function body identical to the preceding dump; duplicate elided)
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}>
#map = affine_map<()[s0] -> (s0 ceildiv 128)>
#map1 = affine_map<()[s0] -> (s0 ceildiv 64)>
#map2 = affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)>
#map3 = affine_map<()[s0] -> (-s0 + (s0 ceildiv 64) * 64)>
#map4 = affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>
#map5 = affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>
#map6 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>
#map7 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#map8 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d5, d3, d6)>
#map9 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)>
#map10 = affine_map<(d0, d1, d2, d3) -> (d1, d3, d0, d2)>
#map11 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d5, d6, d4, d7)>
#map12 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>
#map13 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d7, d3, d4, d6)>
#map14 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map15 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map16 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map17 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
%c64 = arith.constant 64 : index
%c128 = arith.constant 128 : index
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
%9 = affine.apply #map()[%0]
%10 = affine.apply #map1()[%1]
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8>
%12 = affine.apply #map2()[%0]
%13 = affine.apply #map3()[%1]
%padded = tensor.pad %2 low[0, 0] high[%12, %13] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %c0_i8 : i8
} : tensor<?x?xi8> to tensor<?x?xi8>
%14 = affine.apply #map4()[%0]
%15 = affine.apply #map5()[%1]
%16 = arith.divui %14, %c128 : index
%17 = arith.divui %15, %c64 : index
%expanded = tensor.expand_shape %padded [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8>
%18 = linalg.generic {indexing_maps = [#map6, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) {
^bb0(%in: i8, %out: i8):
linalg.yield %in : i8
} -> tensor<?x?x128x64xi8>
%expanded_0 = tensor.expand_shape %18 [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8>
%19 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8>
%20 = linalg.generic {indexing_maps = [#map8, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_0 : tensor<?x?x8x16x2x4x8xi8>) outs(%19 : tensor<?x?x8x4x16x2x8xi8>) {
^bb0(%in: i8, %out: i8):
linalg.yield %in : i8
} -> tensor<?x?x8x4x16x2x8xi8>
%21 = affine.apply #map()[%4]
%22 = affine.apply #map1()[%3]
%23 = tensor.empty(%21, %22) : tensor<?x?x128x64xi8>
%24 = affine.apply #map2()[%4]
%25 = affine.apply #map3()[%3]
%padded_1 = tensor.pad %5 low[0, 0] high[%25, %24] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %c0_i8 : i8
} : tensor<?x?xi8> to tensor<?x?xi8>
%26 = affine.apply #map5()[%3]
%27 = affine.apply #map4()[%4]
%28 = arith.divui %26, %c64 : index
%29 = arith.divui %27, %c128 : index
%expanded_2 = tensor.expand_shape %padded_1 [[0, 1], [2, 3]] output_shape [%28, 64, %29, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8>
%30 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2 : tensor<?x64x?x128xi8>) outs(%23 : tensor<?x?x128x64xi8>) {
^bb0(%in: i8, %out: i8):
linalg.yield %in : i8
} -> tensor<?x?x128x64xi8>
%expanded_3 = tensor.expand_shape %30 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%21, %22, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8>
%31 = tensor.empty(%21, %22) : tensor<?x?x4x2x4x16x2x8xi8>
%32 = linalg.generic {indexing_maps = [#map11, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%31 : tensor<?x?x4x2x4x16x2x8xi8>) {
^bb0(%in: i8, %out: i8):
linalg.yield %in : i8
} -> tensor<?x?x4x2x4x16x2x8xi8>
%33 = affine.apply #map()[%6]
%34 = affine.apply #map()[%7]
%35 = tensor.empty(%33, %34) : tensor<?x?x128x128xi32>
%36 = affine.apply #map2()[%6]
%37 = affine.apply #map2()[%7]
%padded_4 = tensor.pad %8 low[0, 0] high[%36, %37] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %c0_i32 : i32
} : tensor<?x?xi32> to tensor<?x?xi32>
%38 = affine.apply #map4()[%6]
%39 = affine.apply #map4()[%7]
%40 = arith.divui %38, %c128 : index
%41 = arith.divui %39, %c128 : index
%expanded_5 = tensor.expand_shape %padded_4 [[0, 1], [2, 3]] output_shape [%40, 128, %41, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32>
%42 = linalg.generic {indexing_maps = [#map6, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_5 : tensor<?x128x?x128xi32>) outs(%35 : tensor<?x?x128x128xi32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
} -> tensor<?x?x128x128xi32>
%expanded_6 = tensor.expand_shape %42 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%33, %34, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32>
%43 = tensor.empty(%33, %34) : tensor<?x?x8x4x2x4x16x4xi32>
%44 = linalg.generic {indexing_maps = [#map13, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_6 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%43 : tensor<?x?x8x4x2x4x16x4xi32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
} -> tensor<?x?x8x4x2x4x16x4xi32>
%45 = iree_gpu.multi_mma %20, %32, %44 {indexing_maps = [#map14, #map15, #map16], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
%46 = tensor.empty(%33, %34) : tensor<?x?x8x4x4x4x2x16xi32>
%47 = linalg.generic {indexing_maps = [#map17, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%45 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%46 : tensor<?x?x8x4x4x4x2x16xi32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
} -> tensor<?x?x8x4x4x4x2x16xi32>
%collapsed = tensor.collapse_shape %47 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32>
%48 = tensor.empty(%33, %34) : tensor<?x128x?x128xi32>
%49 = linalg.generic {indexing_maps = [#map6, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%collapsed : tensor<?x?x128x128xi32>) outs(%48 : tensor<?x128x?x128xi32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
} -> tensor<?x128x?x128xi32>
%collapsed_7 = tensor.collapse_shape %49 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32>
%extracted_slice = tensor.extract_slice %collapsed_7[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32>
%50 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
util.return %50 : !hal.buffer_view
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 128)> | |
#map1 = affine_map<()[s0] -> (s0 ceildiv 64)> | |
#map2 = affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)> | |
#map3 = affine_map<()[s0] -> (-s0 + (s0 ceildiv 64) * 64)> | |
#map4 = affine_map<()[s0] -> ((s0 ceildiv 128) * 128)> | |
#map5 = affine_map<()[s0] -> ((s0 ceildiv 64) * 64)> | |
#map6 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)> | |
#map7 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map8 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d5, d3, d6)> | |
#map9 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)> | |
#map10 = affine_map<(d0, d1, d2, d3) -> (d1, d3, d0, d2)> | |
#map11 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d5, d6, d4, d7)> | |
#map12 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)> | |
#map13 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d7, d3, d4, d6)> | |
#map14 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
#map15 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
#map16 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#map17 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)> | |
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_hip
  util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
    %c64 = arith.constant 64 : index
    %c128 = arith.constant 128 : index
    %c0_i32 = arith.constant 0 : i32
    %c0_i8 = arith.constant 0 : i8
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
    %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
    %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
    %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
    %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
    %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
    %9 = affine.apply #map()[%0]
    %10 = affine.apply #map1()[%1]
    %11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8>
    %12 = affine.apply #map2()[%0]
    %13 = affine.apply #map3()[%1]
    %padded = tensor.pad %2 low[0, 0] high[%12, %13] {
    ^bb0(%arg3: index, %arg4: index):
      tensor.yield %c0_i8 : i8
    } : tensor<?x?xi8> to tensor<?x?xi8>
    %14 = affine.apply #map4()[%0]
    %15 = affine.apply #map5()[%1]
    %16 = arith.divui %14, %c128 : index
    %17 = arith.divui %15, %c64 : index
    %expanded = tensor.expand_shape %padded [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8>
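    // Annotation: %16 and %17 recover the tile counts from the padded sizes;
    // since %14 = (%0 ceildiv 128) * 128, %16 = %14 / 128 equals %9 (and %17
    // likewise equals %10).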
    %18 = linalg.generic {indexing_maps = [#map6, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) {
    ^bb0(%in: i8, %out: i8):
      linalg.yield %in : i8
    } -> tensor<?x?x128x64xi8>
    %expanded_0 = tensor.expand_shape %18 [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8>
    %19 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8>
    %20 = linalg.generic {indexing_maps = [#map8, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_0 : tensor<?x?x8x16x2x4x8xi8>) outs(%19 : tensor<?x?x8x4x16x2x8xi8>) {
    ^bb0(%in: i8, %out: i8):
      linalg.yield %in : i8
    } -> tensor<?x?x8x4x16x2x8xi8>
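    // Annotation: this is the LHS packing. %2 (M x K, i8) is zero-padded to
    // multiples of 128 x 64, tiled into block-major 128x64 tiles (%18), and
    // each tile is then expanded and permuted into the 8x4x16x2x8 element
    // order that the data-tiled MFMA op below consumes (%20).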
    %21 = affine.apply #map()[%4]
    %22 = affine.apply #map1()[%3]
    %23 = tensor.empty(%21, %22) : tensor<?x?x128x64xi8>
    %24 = affine.apply #map2()[%4]
    %25 = affine.apply #map3()[%3]
    %padded_1 = tensor.pad %5 low[0, 0] high[%25, %24] {
    ^bb0(%arg3: index, %arg4: index):
      tensor.yield %c0_i8 : i8
    } : tensor<?x?xi8> to tensor<?x?xi8>
    %26 = affine.apply #map5()[%3]
    %27 = affine.apply #map4()[%4]
    %28 = arith.divui %26, %c64 : index
    %29 = arith.divui %27, %c128 : index
    %expanded_2 = tensor.expand_shape %padded_1 [[0, 1], [2, 3]] output_shape [%28, 64, %29, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8>
    %30 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2 : tensor<?x64x?x128xi8>) outs(%23 : tensor<?x?x128x64xi8>) {
    ^bb0(%in: i8, %out: i8):
      linalg.yield %in : i8
    } -> tensor<?x?x128x64xi8>
    %expanded_3 = tensor.expand_shape %30 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%21, %22, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8>
    %31 = tensor.empty(%21, %22) : tensor<?x?x4x2x4x16x2x8xi8>
    %32 = linalg.generic {indexing_maps = [#map11, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%31 : tensor<?x?x4x2x4x16x2x8xi8>) {
    ^bb0(%in: i8, %out: i8):
      linalg.yield %in : i8
    } -> tensor<?x?x4x2x4x16x2x8xi8>
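    // Annotation: the mirrored RHS packing. %5 (K x N, i8) is padded to
    // multiples of 64 x 128 and transposed into N-major 128x64 tiles via
    // #map10 (%30), then permuted into the 4x2x4x16x2x8 intrinsic layout
    // (%32).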
    %33 = affine.apply #map()[%6]
    %34 = affine.apply #map()[%7]
    %35 = tensor.empty(%33, %34) : tensor<?x?x128x128xi32>
    %36 = affine.apply #map2()[%6]
    %37 = affine.apply #map2()[%7]
    %padded_4 = tensor.pad %8 low[0, 0] high[%36, %37] {
    ^bb0(%arg3: index, %arg4: index):
      tensor.yield %c0_i32 : i32
    } : tensor<?x?xi32> to tensor<?x?xi32>
    %38 = affine.apply #map4()[%6]
    %39 = affine.apply #map4()[%7]
    %40 = arith.divui %38, %c128 : index
    %41 = arith.divui %39, %c128 : index
    %expanded_5 = tensor.expand_shape %padded_4 [[0, 1], [2, 3]] output_shape [%40, 128, %41, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32>
    %42 = linalg.generic {indexing_maps = [#map6, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_5 : tensor<?x128x?x128xi32>) outs(%35 : tensor<?x?x128x128xi32>) {
    ^bb0(%in: i32, %out: i32):
      linalg.yield %in : i32
    } -> tensor<?x?x128x128xi32>
    %expanded_6 = tensor.expand_shape %42 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%33, %34, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32>
    %43 = tensor.empty(%33, %34) : tensor<?x?x8x4x2x4x16x4xi32>
    %44 = linalg.generic {indexing_maps = [#map13, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_6 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%43 : tensor<?x?x8x4x2x4x16x4xi32>) {
    ^bb0(%in: i32, %out: i32):
      linalg.yield %in : i32
    } -> tensor<?x?x8x4x2x4x16x4xi32>
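    // Annotation: accumulator packing. %8 (M x N, i32) is padded to multiples
    // of 128 x 128, tiled into 128x128 blocks (%42), and permuted into the
    // 8x4x2x4x16x4 accumulator layout of the intrinsic (%44).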
    %45 = iree_gpu.multi_mma %20, %32, %44 {indexing_maps = [#map14, #map15, #map16], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
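    // Annotation: the data-tiled matmul itself. #map14/#map15/#map16 are the
    // usual matmul maps over tiles ((m, k), (n, k), (m, n); the RHS is
    // consumed pre-transposed), and the layout attribute fixes the inner tile
    // shapes: with the MFMA_I32_16x16x32_I8 intrinsic, unroll_m = 8 gives an
    // M tile of 8 * 16 = 128, unroll_n = 2 with unroll_n_to_subgroups = 4
    // gives an N tile of 2 * 4 * 16 = 128, and unroll_k = 2 gives a K tile of
    // 2 * 32 = 64, matching the 128x64, 64x128, and 128x128 padded tiles
    // built above.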
    %46 = tensor.empty(%33, %34) : tensor<?x?x8x4x4x4x2x16xi32>
    %47 = linalg.generic {indexing_maps = [#map17, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%45 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%46 : tensor<?x?x8x4x4x4x2x16xi32>) {
    ^bb0(%in: i32, %out: i32):
      linalg.yield %in : i32
    } -> tensor<?x?x8x4x4x4x2x16xi32>
    %collapsed = tensor.collapse_shape %47 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32>
    %48 = tensor.empty(%33, %34) : tensor<?x128x?x128xi32>
    %49 = linalg.generic {indexing_maps = [#map6, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%collapsed : tensor<?x?x128x128xi32>) outs(%48 : tensor<?x128x?x128xi32>) {
    ^bb0(%in: i32, %out: i32):
      linalg.yield %in : i32
    } -> tensor<?x128x?x128xi32>
    %collapsed_7 = tensor.collapse_shape %49 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32>
    %extracted_slice = tensor.extract_slice %collapsed_7[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32>
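    // Annotation: unpacking. The accumulator tiles are permuted back (%47),
    // collapsed into 128x128 blocks, transposed back to row-major tile order
    // (%49), flattened to the padded M x N matrix, and the original %6 x %7
    // result is sliced out before export.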
    %50 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
    util.return %50 : !hal.buffer_view
  }
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
// (module unchanged: verbatim duplicate of the dump after FoldGlobals above)
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
// (module unchanged: verbatim duplicate of the dump after FoldGlobals above)
// -----// IR Dump After CSE (cse) //----- //
// (module unchanged: verbatim duplicate of the dump after FoldGlobals above)
// -----// IR Dump After HoistIntoGlobals (iree-util-hoist-into-globals) //----- //
// (module unchanged: verbatim duplicate of the dump after FoldGlobals above)
// -----// IR Dump After JitGlobalsPass (iree-consteval-jit-globals) //----- //
// (module unchanged: verbatim duplicate of the dump after FoldGlobals above)
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
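// Annotation: this dump is printed at function rather than module granularity
// (presumably because the pass ran on the function), so the affine_map
// attributes appear inline instead of through the #map aliases; the IR is
// otherwise the same as above.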
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %c64 = arith.constant 64 : index
  %c128 = arith.constant 128 : index
  %c0_i32 = arith.constant 0 : i32
  %c0_i8 = arith.constant 0 : i8
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0]
  %10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1]
  %11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8>
  %12 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)>()[%0]
  %13 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 64) * 64)>()[%1]
  %padded = tensor.pad %2 low[0, 0] high[%12, %13] {
  ^bb0(%arg3: index, %arg4: index):
    tensor.yield %c0_i8 : i8
  } : tensor<?x?xi8> to tensor<?x?xi8>
  %14 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0]
  %15 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1]
  %16 = arith.divui %14, %c128 : index
  %17 = arith.divui %15, %c64 : index
  %expanded = tensor.expand_shape %padded [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8>
  %18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) {
  ^bb0(%in: i8, %out: i8):
    linalg.yield %in : i8
  } -> tensor<?x?x128x64xi8>
  %expanded_0 = tensor.expand_shape %18 [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8>
  %19 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8>
  %20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d5, d3, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_0 : tensor<?x?x8x16x2x4x8xi8>) outs(%19 : tensor<?x?x8x4x16x2x8xi8>) {
  ^bb0(%in: i8, %out: i8):
    linalg.yield %in : i8
  } -> tensor<?x?x8x4x16x2x8xi8>
  %21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4]
  %22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3]
  %23 = tensor.empty(%21, %22) : tensor<?x?x128x64xi8>
  %24 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)>()[%4]
  %25 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 64) * 64)>()[%3]
  %padded_1 = tensor.pad %5 low[0, 0] high[%25, %24] {
  ^bb0(%arg3: index, %arg4: index):
    tensor.yield %c0_i8 : i8
  } : tensor<?x?xi8> to tensor<?x?xi8>
  %26 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3]
  %27 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4]
  %28 = arith.divui %26, %c64 : index
  %29 = arith.divui %27, %c128 : index
  %expanded_2 = tensor.expand_shape %padded_1 [[0, 1], [2, 3]] output_shape [%28, 64, %29, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8>
  %30 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d3, d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2 : tensor<?x64x?x128xi8>) outs(%23 : tensor<?x?x128x64xi8>) {
  ^bb0(%in: i8, %out: i8):
    linalg.yield %in : i8
  } -> tensor<?x?x128x64xi8>
  %expanded_3 = tensor.expand_shape %30 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%21, %22, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8>
  %31 = tensor.empty(%21, %22) : tensor<?x?x4x2x4x16x2x8xi8>
  %32 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d5, d6, d4, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%31 : tensor<?x?x4x2x4x16x2x8xi8>) {
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%33 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%34 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%35 = tensor.empty(%33, %34) : tensor<?x?x128x128xi32> | |
%36 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)>()[%6] | |
%37 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)>()[%7] | |
%padded_4 = tensor.pad %8 low[0, 0] high[%36, %37] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i32 : i32 | |
} : tensor<?x?xi32> to tensor<?x?xi32> | |
%38 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%39 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%40 = arith.divui %38, %c128 : index | |
%41 = arith.divui %39, %c128 : index | |
%expanded_5 = tensor.expand_shape %padded_4 [[0, 1], [2, 3]] output_shape [%40, 128, %41, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32> | |
%42 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_5 : tensor<?x128x?x128xi32>) outs(%35 : tensor<?x?x128x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x128x128xi32> | |
%expanded_6 = tensor.expand_shape %42 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%33, %34, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32> | |
%43 = tensor.empty(%33, %34) : tensor<?x?x8x4x2x4x16x4xi32> | |
%44 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d7, d3, d4, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_6 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%43 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%45 = iree_gpu.multi_mma %20, %32, %44 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%46 = tensor.empty(%33, %34) : tensor<?x?x8x4x4x4x2x16xi32> | |
%47 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%45 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%46 : tensor<?x?x8x4x4x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x4x4x2x16xi32> | |
%collapsed = tensor.collapse_shape %47 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32> | |
%48 = tensor.empty(%33, %34) : tensor<?x128x?x128xi32> | |
%49 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%collapsed : tensor<?x?x128x128xi32>) outs(%48 : tensor<?x128x?x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x128x?x128xi32> | |
%collapsed_7 = tensor.collapse_shape %49 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed_7[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%50 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %50 : !hal.buffer_view | |
} | |
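// The pack sequence that recurs in every dump above (tensor.pad to a tile
// multiple, tensor.expand_shape, then a transposing linalg.generic) has a
// compact reference model. A minimal numpy sketch of the LHS path, under the
// shapes shown above (pack_lhs is an ad hoc name, not an IREE API):

import numpy as np

def pack_lhs(a, tile_m=128, tile_k=64):
    m, k = a.shape
    pad_m = (-m) % tile_m  # high padding, cf. (-s0 + (s0 ceildiv 128) * 128)
    pad_k = (-k) % tile_k  # high padding, cf. (-s0 + (s0 ceildiv 64) * 64)
    a = np.pad(a, ((0, pad_m), (0, pad_k)))            # tensor.pad with 0
    a = a.reshape(a.shape[0] // tile_m, tile_m,
                  a.shape[1] // tile_k, tile_k)        # tensor.expand_shape
    return a.transpose(0, 2, 1, 3)  # linalg.generic, map (d0, d2, d1, d3)

a = np.ones((200, 100), dtype=np.int8)
print(pack_lhs(a).shape)  # (2, 2, 128, 64), i.e. tensor<?x?x128x64xi8>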
// -----// IR Dump After CSE (cse) //----- //
// (IR unchanged: the function body is identical to the dump above.)
// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- //
// (IR unchanged: the function body is identical to the dump above.)
// -----// IR Dump After InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- //
// (IR unchanged: the function body is identical to the dump above.)
// -----// IR Dump After TensorPadToTensorInsertSlicePass (iree-dispatch-creation-tensor-pad-to-tensor-insert-slice) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 128)> | |
#map1 = affine_map<()[s0] -> (s0 ceildiv 64)> | |
#map2 = affine_map<()[s0, s1] -> (-s0 + s1 + (s0 ceildiv 128) * 128)> | |
#map3 = affine_map<()[s0, s1] -> (-s0 + s1 + (s0 ceildiv 64) * 64)> | |
#map4 = affine_map<()[s0] -> ((s0 ceildiv 128) * 128)> | |
#map5 = affine_map<()[s0] -> ((s0 ceildiv 64) * 64)> | |
#map6 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)> | |
#map7 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map8 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d5, d3, d6)> | |
#map9 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)> | |
#map10 = affine_map<(d0, d1, d2, d3) -> (d1, d3, d0, d2)> | |
#map11 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d5, d6, d4, d7)> | |
#map12 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)> | |
#map13 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d7, d3, d4, d6)> | |
#map14 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
#map15 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
#map16 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#map17 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)> | |
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply #map()[%0] | |
%10 = affine.apply #map1()[%1] | |
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8> | |
%dim = tensor.dim %2, %c0 : tensor<?x?xi8> | |
%12 = affine.apply #map2()[%0, %dim] | |
%dim_0 = tensor.dim %2, %c1 : tensor<?x?xi8> | |
%13 = affine.apply #map3()[%1, %dim_0] | |
%14 = tensor.empty(%12, %13) : tensor<?x?xi8> | |
%15 = linalg.fill ins(%c0_i8 : i8) outs(%14 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%dim_1 = tensor.dim %2, %c0 : tensor<?x?xi8> | |
%dim_2 = tensor.dim %2, %c1 : tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %15[0, 0] [%dim_1, %dim_2] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%16 = affine.apply #map4()[%0] | |
%17 = affine.apply #map5()[%1] | |
%18 = arith.divui %16, %c128 : index | |
%19 = arith.divui %17, %c64 : index | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1], [2, 3]] output_shape [%18, 128, %19, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8> | |
%20 = linalg.generic {indexing_maps = [#map6, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_3 = tensor.expand_shape %20 [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8> | |
%21 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%22 = linalg.generic {indexing_maps = [#map8, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x?x8x16x2x4x8xi8>) outs(%21 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
%23 = affine.apply #map()[%4] | |
%24 = affine.apply #map1()[%3] | |
%25 = tensor.empty(%23, %24) : tensor<?x?x128x64xi8> | |
%dim_4 = tensor.dim %5, %c0 : tensor<?x?xi8> | |
%26 = affine.apply #map3()[%3, %dim_4] | |
%dim_5 = tensor.dim %5, %c1 : tensor<?x?xi8> | |
%27 = affine.apply #map2()[%4, %dim_5] | |
%28 = tensor.empty(%26, %27) : tensor<?x?xi8> | |
%29 = linalg.fill ins(%c0_i8 : i8) outs(%28 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%dim_6 = tensor.dim %5, %c0 : tensor<?x?xi8> | |
%dim_7 = tensor.dim %5, %c1 : tensor<?x?xi8> | |
%inserted_slice_8 = tensor.insert_slice %5 into %29[0, 0] [%dim_6, %dim_7] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%30 = affine.apply #map5()[%3] | |
%31 = affine.apply #map4()[%4] | |
%32 = arith.divui %30, %c64 : index | |
%33 = arith.divui %31, %c128 : index | |
%expanded_9 = tensor.expand_shape %inserted_slice_8 [[0, 1], [2, 3]] output_shape [%32, 64, %33, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8> | |
%34 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_9 : tensor<?x64x?x128xi8>) outs(%25 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_10 = tensor.expand_shape %34 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%23, %24, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8> | |
%35 = tensor.empty(%23, %24) : tensor<?x?x4x2x4x16x2x8xi8> | |
%36 = linalg.generic {indexing_maps = [#map11, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_10 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%35 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%37 = affine.apply #map()[%6] | |
%38 = affine.apply #map()[%7] | |
%39 = tensor.empty(%37, %38) : tensor<?x?x128x128xi32> | |
%dim_11 = tensor.dim %8, %c0 : tensor<?x?xi32> | |
%40 = affine.apply #map2()[%6, %dim_11] | |
%dim_12 = tensor.dim %8, %c1 : tensor<?x?xi32> | |
%41 = affine.apply #map2()[%7, %dim_12] | |
%42 = tensor.empty(%40, %41) : tensor<?x?xi32> | |
%43 = linalg.fill ins(%c0_i32 : i32) outs(%42 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%dim_13 = tensor.dim %8, %c0 : tensor<?x?xi32> | |
%dim_14 = tensor.dim %8, %c1 : tensor<?x?xi32> | |
%inserted_slice_15 = tensor.insert_slice %8 into %43[0, 0] [%dim_13, %dim_14] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32> | |
%44 = affine.apply #map4()[%6] | |
%45 = affine.apply #map4()[%7] | |
%46 = arith.divui %44, %c128 : index | |
%47 = arith.divui %45, %c128 : index | |
%expanded_16 = tensor.expand_shape %inserted_slice_15 [[0, 1], [2, 3]] output_shape [%46, 128, %47, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32> | |
%48 = linalg.generic {indexing_maps = [#map6, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_16 : tensor<?x128x?x128xi32>) outs(%39 : tensor<?x?x128x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x128x128xi32> | |
%expanded_17 = tensor.expand_shape %48 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%37, %38, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32> | |
%49 = tensor.empty(%37, %38) : tensor<?x?x8x4x2x4x16x4xi32> | |
%50 = linalg.generic {indexing_maps = [#map13, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_17 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%49 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%51 = iree_gpu.multi_mma %22, %36, %50 {indexing_maps = [#map14, #map15, #map16], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%52 = tensor.empty(%37, %38) : tensor<?x?x8x4x4x4x2x16xi32> | |
%53 = linalg.generic {indexing_maps = [#map17, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%51 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%52 : tensor<?x?x8x4x4x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x4x4x2x16xi32> | |
%collapsed = tensor.collapse_shape %53 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32> | |
%54 = tensor.empty(%37, %38) : tensor<?x128x?x128xi32> | |
%55 = linalg.generic {indexing_maps = [#map6, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%collapsed : tensor<?x?x128x128xi32>) outs(%54 : tensor<?x128x?x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x128x?x128xi32> | |
%collapsed_18 = tensor.collapse_shape %55 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed_18[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%56 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %56 : !hal.buffer_view | |
} | |
} | |
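// TensorPadToTensorInsertSlicePass rewrote each tensor.pad above into a
// linalg.fill of the padding value plus a tensor.insert_slice at [0, 0].
// A numpy sketch of why the two forms agree (illustrative only; the helper
// names are mine):

import numpy as np

def pad_high(x, high0, high1, pad_value=0):
    # Reference semantics of tensor.pad low = [0, 0], high = [high0, high1].
    return np.pad(x, ((0, high0), (0, high1)), constant_values=pad_value)

def fill_and_insert(x, high0, high1, pad_value=0):
    # The rewritten form: linalg.fill, then tensor.insert_slice of x.
    d0, d1 = x.shape
    out = np.full((d0 + high0, d1 + high1), pad_value, x.dtype)  # linalg.fill
    out[0:d0, 0:d1] = x                            # tensor.insert_slice
    return out

x = np.arange(6, dtype=np.int8).reshape(2, 3)
assert np.array_equal(pad_high(x, 2, 1), fill_and_insert(x, 2, 1))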
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8> | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%13 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%14 = tensor.empty(%12, %13) : tensor<?x?xi8> | |
%15 = linalg.fill ins(%c0_i8 : i8) outs(%14 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %15[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%16 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%17 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%18 = arith.divui %16, %c128 : index | |
%19 = arith.divui %17, %c64 : index | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1], [2, 3]] output_shape [%18, 128, %19, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_0 = tensor.expand_shape %20 [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8> | |
%21 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d5, d3, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_0 : tensor<?x?x8x16x2x4x8xi8>) outs(%21 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
%23 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%24 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%25 = tensor.empty(%23, %24) : tensor<?x?x128x64xi8> | |
%26 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%27 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%28 = tensor.empty(%26, %27) : tensor<?x?xi8> | |
%29 = linalg.fill ins(%c0_i8 : i8) outs(%28 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice_1 = tensor.insert_slice %5 into %29[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%30 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%31 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%32 = arith.divui %30, %c64 : index | |
%33 = arith.divui %31, %c128 : index | |
%expanded_2 = tensor.expand_shape %inserted_slice_1 [[0, 1], [2, 3]] output_shape [%32, 64, %33, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8> | |
%34 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d3, d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2 : tensor<?x64x?x128xi8>) outs(%25 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_3 = tensor.expand_shape %34 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%23, %24, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8> | |
%35 = tensor.empty(%23, %24) : tensor<?x?x4x2x4x16x2x8xi8> | |
%36 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d5, d6, d4, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%35 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%37 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%38 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%39 = tensor.empty(%37, %38) : tensor<?x?x128x128xi32> | |
%40 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%41 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%42 = tensor.empty(%40, %41) : tensor<?x?xi32> | |
%43 = linalg.fill ins(%c0_i32 : i32) outs(%42 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%inserted_slice_4 = tensor.insert_slice %8 into %43[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32> | |
%44 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%45 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%46 = arith.divui %44, %c128 : index | |
%47 = arith.divui %45, %c128 : index | |
%expanded_5 = tensor.expand_shape %inserted_slice_4 [[0, 1], [2, 3]] output_shape [%46, 128, %47, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32> | |
%48 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_5 : tensor<?x128x?x128xi32>) outs(%39 : tensor<?x?x128x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x128x128xi32> | |
%expanded_6 = tensor.expand_shape %48 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%37, %38, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32> | |
%49 = tensor.empty(%37, %38) : tensor<?x?x8x4x2x4x16x4xi32> | |
%50 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d7, d3, d4, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_6 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%49 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%51 = iree_gpu.multi_mma %22, %36, %50 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%52 = tensor.empty(%37, %38) : tensor<?x?x8x4x4x4x2x16xi32> | |
%53 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%51 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%52 : tensor<?x?x8x4x4x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x4x4x2x16xi32> | |
%collapsed = tensor.collapse_shape %53 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32> | |
%54 = tensor.empty(%37, %38) : tensor<?x128x?x128xi32> | |
%55 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%collapsed : tensor<?x?x128x128xi32>) outs(%54 : tensor<?x128x?x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x128x?x128xi32> | |
%collapsed_7 = tensor.collapse_shape %55 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed_7[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%56 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %56 : !hal.buffer_view | |
} | |
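// Relative to the previous dump, canonicalization folded the tensor.dim ops
// on the imported tensors into the hal.buffer_view.dim results, which turned
// the two-symbol maps like (-s0 + s1 + (s0 ceildiv 128) * 128) back into the
// one-symbol padded-size form ((s0 ceildiv 128) * 128). That left pairs of
// identical affine.apply ops (e.g. %12/%16 and %13/%17 above), which the CSE
// dump below eliminates, feeding arith.divui directly from %12 and %13. The
// arithmetic itself, as a standalone sketch (the example size is an
// assumption, not taken from the dump):

def ceildiv(a, b):
    return -(-a // b)

d = 1000                                 # some dynamic dimension size
padded = ceildiv(d, 128) * 128           # affine_map ((s0 ceildiv 128) * 128)
assert padded % 128 == 0 and 0 <= padded - d < 128
assert padded // 128 == ceildiv(d, 128)  # the tile count arith.divui recovers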
// -----// IR Dump After CSE (cse) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8> | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%13 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%14 = tensor.empty(%12, %13) : tensor<?x?xi8> | |
%15 = linalg.fill ins(%c0_i8 : i8) outs(%14 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %15[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%16 = arith.divui %12, %c128 : index | |
%17 = arith.divui %13, %c64 : index | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_0 = tensor.expand_shape %18 [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8> | |
%19 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d5, d3, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_0 : tensor<?x?x8x16x2x4x8xi8>) outs(%19 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
%21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%23 = tensor.empty(%21, %22) : tensor<?x?x128x64xi8> | |
%24 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%25 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%26 = tensor.empty(%24, %25) : tensor<?x?xi8> | |
%27 = linalg.fill ins(%c0_i8 : i8) outs(%26 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice_1 = tensor.insert_slice %5 into %27[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%28 = arith.divui %24, %c64 : index | |
%29 = arith.divui %25, %c128 : index | |
%expanded_2 = tensor.expand_shape %inserted_slice_1 [[0, 1], [2, 3]] output_shape [%28, 64, %29, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8> | |
%30 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d3, d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2 : tensor<?x64x?x128xi8>) outs(%23 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_3 = tensor.expand_shape %30 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%21, %22, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8> | |
%31 = tensor.empty(%21, %22) : tensor<?x?x4x2x4x16x2x8xi8> | |
%32 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d5, d6, d4, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%31 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%33 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%34 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%35 = tensor.empty(%33, %34) : tensor<?x?x128x128xi32> | |
%36 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%37 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%38 = tensor.empty(%36, %37) : tensor<?x?xi32> | |
%39 = linalg.fill ins(%c0_i32 : i32) outs(%38 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%inserted_slice_4 = tensor.insert_slice %8 into %39[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32> | |
%40 = arith.divui %36, %c128 : index | |
%41 = arith.divui %37, %c128 : index | |
%expanded_5 = tensor.expand_shape %inserted_slice_4 [[0, 1], [2, 3]] output_shape [%40, 128, %41, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32> | |
%42 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_5 : tensor<?x128x?x128xi32>) outs(%35 : tensor<?x?x128x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x128x128xi32> | |
%expanded_6 = tensor.expand_shape %42 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%33, %34, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32> | |
%43 = tensor.empty(%33, %34) : tensor<?x?x8x4x2x4x16x4xi32> | |
%44 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d7, d3, d4, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_6 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%43 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%45 = iree_gpu.multi_mma %20, %32, %44 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%46 = tensor.empty(%33, %34) : tensor<?x?x8x4x4x4x2x16xi32> | |
%47 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%45 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%46 : tensor<?x?x8x4x4x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x4x4x2x16xi32> | |
%collapsed = tensor.collapse_shape %47 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32> | |
%48 = tensor.empty(%33, %34) : tensor<?x128x?x128xi32> | |
%49 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%collapsed : tensor<?x?x128x128xi32>) outs(%48 : tensor<?x128x?x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x128x?x128xi32> | |
%collapsed_7 = tensor.collapse_shape %49 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed_7[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%50 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %50 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
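// @foo contains no util.global accesses at this point, so this pass is a no-op here; the dump below is identical to the post-CSE IR above.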
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8> | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%13 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%14 = tensor.empty(%12, %13) : tensor<?x?xi8> | |
%15 = linalg.fill ins(%c0_i8 : i8) outs(%14 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %15[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%16 = arith.divui %12, %c128 : index | |
%17 = arith.divui %13, %c64 : index | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_0 = tensor.expand_shape %18 [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8> | |
%19 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d5, d3, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_0 : tensor<?x?x8x16x2x4x8xi8>) outs(%19 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
%21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%23 = tensor.empty(%21, %22) : tensor<?x?x128x64xi8> | |
%24 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%25 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%26 = tensor.empty(%24, %25) : tensor<?x?xi8> | |
%27 = linalg.fill ins(%c0_i8 : i8) outs(%26 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice_1 = tensor.insert_slice %5 into %27[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%28 = arith.divui %24, %c64 : index | |
%29 = arith.divui %25, %c128 : index | |
%expanded_2 = tensor.expand_shape %inserted_slice_1 [[0, 1], [2, 3]] output_shape [%28, 64, %29, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8> | |
%30 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d3, d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2 : tensor<?x64x?x128xi8>) outs(%23 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_3 = tensor.expand_shape %30 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%21, %22, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8> | |
%31 = tensor.empty(%21, %22) : tensor<?x?x4x2x4x16x2x8xi8> | |
%32 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d5, d6, d4, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%31 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%33 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%34 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%35 = tensor.empty(%33, %34) : tensor<?x?x128x128xi32> | |
%36 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%37 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%38 = tensor.empty(%36, %37) : tensor<?x?xi32> | |
%39 = linalg.fill ins(%c0_i32 : i32) outs(%38 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%inserted_slice_4 = tensor.insert_slice %8 into %39[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32> | |
%40 = arith.divui %36, %c128 : index | |
%41 = arith.divui %37, %c128 : index | |
%expanded_5 = tensor.expand_shape %inserted_slice_4 [[0, 1], [2, 3]] output_shape [%40, 128, %41, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32> | |
%42 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_5 : tensor<?x128x?x128xi32>) outs(%35 : tensor<?x?x128x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x128x128xi32> | |
%expanded_6 = tensor.expand_shape %42 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%33, %34, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32> | |
%43 = tensor.empty(%33, %34) : tensor<?x?x8x4x2x4x16x4xi32> | |
%44 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d7, d3, d4, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_6 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%43 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%45 = iree_gpu.multi_mma %20, %32, %44 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%46 = tensor.empty(%33, %34) : tensor<?x?x8x4x4x4x2x16xi32> | |
%47 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%45 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%46 : tensor<?x?x8x4x4x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x4x4x2x16xi32> | |
%collapsed = tensor.collapse_shape %47 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32> | |
%48 = tensor.empty(%33, %34) : tensor<?x128x?x128xi32> | |
%49 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%collapsed : tensor<?x?x128x128xi32>) outs(%48 : tensor<?x128x?x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x128x?x128xi32> | |
%collapsed_7 = tensor.collapse_shape %49 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed_7[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%50 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %50 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // | |
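// From this pass onward the dump prints the enclosing module, so the inline affine_map attributes are hoisted into #map aliases and the HIP/ROCm target configuration (#executable_target_rocm_hsaco_fb, #device_target_hip, @__device_0) becomes visible; the body of @foo is unchanged.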
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 128)> | |
#map1 = affine_map<()[s0] -> (s0 ceildiv 64)> | |
#map2 = affine_map<()[s0] -> ((s0 ceildiv 128) * 128)> | |
#map3 = affine_map<()[s0] -> ((s0 ceildiv 64) * 64)> | |
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)> | |
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map6 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d5, d3, d6)> | |
#map7 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)> | |
#map8 = affine_map<(d0, d1, d2, d3) -> (d1, d3, d0, d2)> | |
#map9 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d5, d6, d4, d7)> | |
#map10 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)> | |
#map11 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d7, d3, d4, d6)> | |
#map12 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
#map13 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
#map14 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#map15 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)> | |
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply #map()[%0] | |
%10 = affine.apply #map1()[%1] | |
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8> | |
%12 = affine.apply #map2()[%0] | |
%13 = affine.apply #map3()[%1] | |
%14 = tensor.empty(%12, %13) : tensor<?x?xi8> | |
%15 = linalg.fill ins(%c0_i8 : i8) outs(%14 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %15[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%16 = arith.divui %12, %c128 : index | |
%17 = arith.divui %13, %c64 : index | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8> | |
%18 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_0 = tensor.expand_shape %18 [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8> | |
%19 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%20 = linalg.generic {indexing_maps = [#map6, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_0 : tensor<?x?x8x16x2x4x8xi8>) outs(%19 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
%21 = affine.apply #map()[%4] | |
%22 = affine.apply #map1()[%3] | |
%23 = tensor.empty(%21, %22) : tensor<?x?x128x64xi8> | |
%24 = affine.apply #map3()[%3] | |
%25 = affine.apply #map2()[%4] | |
%26 = tensor.empty(%24, %25) : tensor<?x?xi8> | |
%27 = linalg.fill ins(%c0_i8 : i8) outs(%26 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice_1 = tensor.insert_slice %5 into %27[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%28 = arith.divui %24, %c64 : index | |
%29 = arith.divui %25, %c128 : index | |
%expanded_2 = tensor.expand_shape %inserted_slice_1 [[0, 1], [2, 3]] output_shape [%28, 64, %29, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8> | |
%30 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2 : tensor<?x64x?x128xi8>) outs(%23 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_3 = tensor.expand_shape %30 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%21, %22, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8> | |
%31 = tensor.empty(%21, %22) : tensor<?x?x4x2x4x16x2x8xi8> | |
%32 = linalg.generic {indexing_maps = [#map9, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%31 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%33 = affine.apply #map()[%6] | |
%34 = affine.apply #map()[%7] | |
%35 = tensor.empty(%33, %34) : tensor<?x?x128x128xi32> | |
%36 = affine.apply #map2()[%6] | |
%37 = affine.apply #map2()[%7] | |
%38 = tensor.empty(%36, %37) : tensor<?x?xi32> | |
%39 = linalg.fill ins(%c0_i32 : i32) outs(%38 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%inserted_slice_4 = tensor.insert_slice %8 into %39[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32> | |
%40 = arith.divui %36, %c128 : index | |
%41 = arith.divui %37, %c128 : index | |
%expanded_5 = tensor.expand_shape %inserted_slice_4 [[0, 1], [2, 3]] output_shape [%40, 128, %41, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32> | |
%42 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_5 : tensor<?x128x?x128xi32>) outs(%35 : tensor<?x?x128x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x128x128xi32> | |
%expanded_6 = tensor.expand_shape %42 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%33, %34, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32> | |
%43 = tensor.empty(%33, %34) : tensor<?x?x8x4x2x4x16x4xi32> | |
%44 = linalg.generic {indexing_maps = [#map11, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_6 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%43 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%45 = iree_gpu.multi_mma %20, %32, %44 {indexing_maps = [#map12, #map13, #map14], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%46 = tensor.empty(%33, %34) : tensor<?x?x8x4x4x4x2x16xi32> | |
%47 = linalg.generic {indexing_maps = [#map15, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%45 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%46 : tensor<?x?x8x4x4x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x4x4x2x16xi32> | |
%collapsed = tensor.collapse_shape %47 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32> | |
%48 = tensor.empty(%33, %34) : tensor<?x128x?x128xi32> | |
%49 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%collapsed : tensor<?x?x128x128xi32>) outs(%48 : tensor<?x128x?x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x128x?x128xi32> | |
%collapsed_7 = tensor.collapse_shape %49 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed_7[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%50 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %50 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
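// The module's only global is the immutable @__device_0 device target, so there is nothing to fold; the module is unchanged.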
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 128)> | |
#map1 = affine_map<()[s0] -> (s0 ceildiv 64)> | |
#map2 = affine_map<()[s0] -> ((s0 ceildiv 128) * 128)> | |
#map3 = affine_map<()[s0] -> ((s0 ceildiv 64) * 64)> | |
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)> | |
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map6 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d5, d3, d6)> | |
#map7 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)> | |
#map8 = affine_map<(d0, d1, d2, d3) -> (d1, d3, d0, d2)> | |
#map9 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d5, d6, d4, d7)> | |
#map10 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)> | |
#map11 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d7, d3, d4, d6)> | |
#map12 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
#map13 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
#map14 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#map15 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)> | |
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply #map()[%0] | |
%10 = affine.apply #map1()[%1] | |
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8> | |
%12 = affine.apply #map2()[%0] | |
%13 = affine.apply #map3()[%1] | |
%14 = tensor.empty(%12, %13) : tensor<?x?xi8> | |
%15 = linalg.fill ins(%c0_i8 : i8) outs(%14 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %15[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%16 = arith.divui %12, %c128 : index | |
%17 = arith.divui %13, %c64 : index | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8> | |
%18 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_0 = tensor.expand_shape %18 [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8> | |
%19 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%20 = linalg.generic {indexing_maps = [#map6, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_0 : tensor<?x?x8x16x2x4x8xi8>) outs(%19 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
%21 = affine.apply #map()[%4] | |
%22 = affine.apply #map1()[%3] | |
%23 = tensor.empty(%21, %22) : tensor<?x?x128x64xi8> | |
%24 = affine.apply #map3()[%3] | |
%25 = affine.apply #map2()[%4] | |
%26 = tensor.empty(%24, %25) : tensor<?x?xi8> | |
%27 = linalg.fill ins(%c0_i8 : i8) outs(%26 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice_1 = tensor.insert_slice %5 into %27[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%28 = arith.divui %24, %c64 : index | |
%29 = arith.divui %25, %c128 : index | |
%expanded_2 = tensor.expand_shape %inserted_slice_1 [[0, 1], [2, 3]] output_shape [%28, 64, %29, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8> | |
%30 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2 : tensor<?x64x?x128xi8>) outs(%23 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_3 = tensor.expand_shape %30 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%21, %22, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8> | |
%31 = tensor.empty(%21, %22) : tensor<?x?x4x2x4x16x2x8xi8> | |
%32 = linalg.generic {indexing_maps = [#map9, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%31 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%33 = affine.apply #map()[%6] | |
%34 = affine.apply #map()[%7] | |
%35 = tensor.empty(%33, %34) : tensor<?x?x128x128xi32> | |
%36 = affine.apply #map2()[%6] | |
%37 = affine.apply #map2()[%7] | |
%38 = tensor.empty(%36, %37) : tensor<?x?xi32> | |
%39 = linalg.fill ins(%c0_i32 : i32) outs(%38 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%inserted_slice_4 = tensor.insert_slice %8 into %39[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32> | |
%40 = arith.divui %36, %c128 : index | |
%41 = arith.divui %37, %c128 : index | |
%expanded_5 = tensor.expand_shape %inserted_slice_4 [[0, 1], [2, 3]] output_shape [%40, 128, %41, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32> | |
%42 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_5 : tensor<?x128x?x128xi32>) outs(%35 : tensor<?x?x128x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x128x128xi32> | |
%expanded_6 = tensor.expand_shape %42 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%33, %34, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32> | |
%43 = tensor.empty(%33, %34) : tensor<?x?x8x4x2x4x16x4xi32> | |
%44 = linalg.generic {indexing_maps = [#map11, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_6 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%43 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%45 = iree_gpu.multi_mma %20, %32, %44 {indexing_maps = [#map12, #map13, #map14], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%46 = tensor.empty(%33, %34) : tensor<?x?x8x4x4x4x2x16xi32> | |
%47 = linalg.generic {indexing_maps = [#map15, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%45 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%46 : tensor<?x?x8x4x4x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x4x4x2x16xi32> | |
%collapsed = tensor.collapse_shape %47 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32> | |
%48 = tensor.empty(%33, %34) : tensor<?x128x?x128xi32> | |
%49 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%collapsed : tensor<?x?x128x128xi32>) outs(%48 : tensor<?x128x?x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x128x?x128xi32> | |
%collapsed_7 = tensor.collapse_shape %49 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed_7[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%50 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %50 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // | |
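// With a single global there are likewise no fusion candidates; the module is again unchanged.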
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 128)> | |
#map1 = affine_map<()[s0] -> (s0 ceildiv 64)> | |
#map2 = affine_map<()[s0] -> ((s0 ceildiv 128) * 128)> | |
#map3 = affine_map<()[s0] -> ((s0 ceildiv 64) * 64)> | |
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)> | |
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map6 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d5, d3, d6)> | |
#map7 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)> | |
#map8 = affine_map<(d0, d1, d2, d3) -> (d1, d3, d0, d2)> | |
#map9 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d5, d6, d4, d7)> | |
#map10 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)> | |
#map11 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d7, d3, d4, d6)> | |
#map12 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
#map13 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
#map14 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#map15 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)> | |
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply #map()[%0] | |
%10 = affine.apply #map1()[%1] | |
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8> | |
%12 = affine.apply #map2()[%0] | |
%13 = affine.apply #map3()[%1] | |
%14 = tensor.empty(%12, %13) : tensor<?x?xi8> | |
%15 = linalg.fill ins(%c0_i8 : i8) outs(%14 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %15[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%16 = arith.divui %12, %c128 : index | |
%17 = arith.divui %13, %c64 : index | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8> | |
%18 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_0 = tensor.expand_shape %18 [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8> | |
%19 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%20 = linalg.generic {indexing_maps = [#map6, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_0 : tensor<?x?x8x16x2x4x8xi8>) outs(%19 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
%21 = affine.apply #map()[%4] | |
%22 = affine.apply #map1()[%3] | |
%23 = tensor.empty(%21, %22) : tensor<?x?x128x64xi8> | |
%24 = affine.apply #map3()[%3] | |
%25 = affine.apply #map2()[%4] | |
%26 = tensor.empty(%24, %25) : tensor<?x?xi8> | |
%27 = linalg.fill ins(%c0_i8 : i8) outs(%26 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice_1 = tensor.insert_slice %5 into %27[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%28 = arith.divui %24, %c64 : index | |
%29 = arith.divui %25, %c128 : index | |
%expanded_2 = tensor.expand_shape %inserted_slice_1 [[0, 1], [2, 3]] output_shape [%28, 64, %29, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8> | |
%30 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2 : tensor<?x64x?x128xi8>) outs(%23 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_3 = tensor.expand_shape %30 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%21, %22, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8> | |
%31 = tensor.empty(%21, %22) : tensor<?x?x4x2x4x16x2x8xi8> | |
%32 = linalg.generic {indexing_maps = [#map9, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%31 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%33 = affine.apply #map()[%6] | |
%34 = affine.apply #map()[%7] | |
%35 = tensor.empty(%33, %34) : tensor<?x?x128x128xi32> | |
%36 = affine.apply #map2()[%6] | |
%37 = affine.apply #map2()[%7] | |
%38 = tensor.empty(%36, %37) : tensor<?x?xi32> | |
%39 = linalg.fill ins(%c0_i32 : i32) outs(%38 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%inserted_slice_4 = tensor.insert_slice %8 into %39[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32> | |
%40 = arith.divui %36, %c128 : index | |
%41 = arith.divui %37, %c128 : index | |
%expanded_5 = tensor.expand_shape %inserted_slice_4 [[0, 1], [2, 3]] output_shape [%40, 128, %41, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32> | |
%42 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_5 : tensor<?x128x?x128xi32>) outs(%35 : tensor<?x?x128x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x128x128xi32> | |
%expanded_6 = tensor.expand_shape %42 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%33, %34, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32> | |
%43 = tensor.empty(%33, %34) : tensor<?x?x8x4x2x4x16x4xi32> | |
%44 = linalg.generic {indexing_maps = [#map11, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_6 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%43 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%45 = iree_gpu.multi_mma %20, %32, %44 {indexing_maps = [#map12, #map13, #map14], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%46 = tensor.empty(%33, %34) : tensor<?x?x8x4x4x4x2x16xi32> | |
%47 = linalg.generic {indexing_maps = [#map15, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%45 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%46 : tensor<?x?x8x4x4x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x4x4x2x16xi32> | |
%collapsed = tensor.collapse_shape %47 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32> | |
%48 = tensor.empty(%33, %34) : tensor<?x128x?x128xi32> | |
%49 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%collapsed : tensor<?x?x128x128xi32>) outs(%48 : tensor<?x128x?x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x128x?x128xi32> | |
%collapsed_7 = tensor.collapse_shape %49 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed_7[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%50 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %50 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After IPO (iree-util-ipo) //----- // | |
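// @foo is the only function in the module, so interprocedural optimization finds no call sites to propagate through; the dump is identical to the previous one.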
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 128)> | |
#map1 = affine_map<()[s0] -> (s0 ceildiv 64)> | |
#map2 = affine_map<()[s0] -> ((s0 ceildiv 128) * 128)> | |
#map3 = affine_map<()[s0] -> ((s0 ceildiv 64) * 64)> | |
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)> | |
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map6 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d5, d3, d6)> | |
#map7 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)> | |
#map8 = affine_map<(d0, d1, d2, d3) -> (d1, d3, d0, d2)> | |
#map9 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d5, d6, d4, d7)> | |
#map10 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)> | |
#map11 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d7, d3, d4, d6)> | |
#map12 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
#map13 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
#map14 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#map15 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)> | |
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply #map()[%0] | |
%10 = affine.apply #map1()[%1] | |
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8> | |
%12 = affine.apply #map2()[%0] | |
%13 = affine.apply #map3()[%1] | |
%14 = tensor.empty(%12, %13) : tensor<?x?xi8> | |
%15 = linalg.fill ins(%c0_i8 : i8) outs(%14 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %15[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%16 = arith.divui %12, %c128 : index | |
%17 = arith.divui %13, %c64 : index | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8> | |
%18 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_0 = tensor.expand_shape %18 [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8> | |
%19 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%20 = linalg.generic {indexing_maps = [#map6, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_0 : tensor<?x?x8x16x2x4x8xi8>) outs(%19 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
%21 = affine.apply #map()[%4] | |
%22 = affine.apply #map1()[%3] | |
%23 = tensor.empty(%21, %22) : tensor<?x?x128x64xi8> | |
%24 = affine.apply #map3()[%3] | |
%25 = affine.apply #map2()[%4] | |
%26 = tensor.empty(%24, %25) : tensor<?x?xi8> | |
%27 = linalg.fill ins(%c0_i8 : i8) outs(%26 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice_1 = tensor.insert_slice %5 into %27[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%28 = arith.divui %24, %c64 : index | |
%29 = arith.divui %25, %c128 : index | |
%expanded_2 = tensor.expand_shape %inserted_slice_1 [[0, 1], [2, 3]] output_shape [%28, 64, %29, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8> | |
%30 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2 : tensor<?x64x?x128xi8>) outs(%23 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_3 = tensor.expand_shape %30 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%21, %22, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8> | |
%31 = tensor.empty(%21, %22) : tensor<?x?x4x2x4x16x2x8xi8> | |
%32 = linalg.generic {indexing_maps = [#map9, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%31 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%33 = affine.apply #map()[%6] | |
%34 = affine.apply #map()[%7] | |
%35 = tensor.empty(%33, %34) : tensor<?x?x128x128xi32> | |
%36 = affine.apply #map2()[%6] | |
%37 = affine.apply #map2()[%7] | |
%38 = tensor.empty(%36, %37) : tensor<?x?xi32> | |
%39 = linalg.fill ins(%c0_i32 : i32) outs(%38 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%inserted_slice_4 = tensor.insert_slice %8 into %39[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32> | |
%40 = arith.divui %36, %c128 : index | |
%41 = arith.divui %37, %c128 : index | |
%expanded_5 = tensor.expand_shape %inserted_slice_4 [[0, 1], [2, 3]] output_shape [%40, 128, %41, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32> | |
%42 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_5 : tensor<?x128x?x128xi32>) outs(%35 : tensor<?x?x128x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x128x128xi32> | |
%expanded_6 = tensor.expand_shape %42 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%33, %34, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32> | |
%43 = tensor.empty(%33, %34) : tensor<?x?x8x4x2x4x16x4xi32> | |
%44 = linalg.generic {indexing_maps = [#map11, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_6 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%43 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%45 = iree_gpu.multi_mma %20, %32, %44 {indexing_maps = [#map12, #map13, #map14], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%46 = tensor.empty(%33, %34) : tensor<?x?x8x4x4x4x2x16xi32> | |
%47 = linalg.generic {indexing_maps = [#map15, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%45 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%46 : tensor<?x?x8x4x4x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x4x4x2x16xi32> | |
%collapsed = tensor.collapse_shape %47 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32> | |
%48 = tensor.empty(%33, %34) : tensor<?x128x?x128xi32> | |
%49 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%collapsed : tensor<?x?x128x128xi32>) outs(%48 : tensor<?x128x?x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x128x?x128xi32> | |
%collapsed_7 = tensor.collapse_shape %49 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed_7[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%50 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %50 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FixedPointIterator (iree-util-fixed-point-iterator) //----- // | |
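// The fixed-point driver appears to have converged after a single iteration: the iree.fixedpoint.iteration = 0 module attribute present in the previous dumps is dropped below, leaving only stream.affinity.default.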
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 128)> | |
#map1 = affine_map<()[s0] -> (s0 ceildiv 64)> | |
#map2 = affine_map<()[s0] -> ((s0 ceildiv 128) * 128)> | |
#map3 = affine_map<()[s0] -> ((s0 ceildiv 64) * 64)> | |
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)> | |
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map6 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d5, d3, d6)> | |
#map7 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)> | |
#map8 = affine_map<(d0, d1, d2, d3) -> (d1, d3, d0, d2)> | |
#map9 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d5, d6, d4, d7)> | |
#map10 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)> | |
#map11 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d7, d3, d4, d6)> | |
#map12 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
#map13 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
#map14 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#map15 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)> | |
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply #map()[%0] | |
%10 = affine.apply #map1()[%1] | |
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8> | |
%12 = affine.apply #map2()[%0] | |
%13 = affine.apply #map3()[%1] | |
%14 = tensor.empty(%12, %13) : tensor<?x?xi8> | |
%15 = linalg.fill ins(%c0_i8 : i8) outs(%14 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %15[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%16 = arith.divui %12, %c128 : index | |
%17 = arith.divui %13, %c64 : index | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8> | |
%18 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_0 = tensor.expand_shape %18 [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8> | |
%19 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%20 = linalg.generic {indexing_maps = [#map6, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_0 : tensor<?x?x8x16x2x4x8xi8>) outs(%19 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
%21 = affine.apply #map()[%4] | |
%22 = affine.apply #map1()[%3] | |
%23 = tensor.empty(%21, %22) : tensor<?x?x128x64xi8> | |
%24 = affine.apply #map3()[%3] | |
%25 = affine.apply #map2()[%4] | |
%26 = tensor.empty(%24, %25) : tensor<?x?xi8> | |
%27 = linalg.fill ins(%c0_i8 : i8) outs(%26 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice_1 = tensor.insert_slice %5 into %27[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%28 = arith.divui %24, %c64 : index | |
%29 = arith.divui %25, %c128 : index | |
%expanded_2 = tensor.expand_shape %inserted_slice_1 [[0, 1], [2, 3]] output_shape [%28, 64, %29, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8> | |
%30 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2 : tensor<?x64x?x128xi8>) outs(%23 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_3 = tensor.expand_shape %30 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%21, %22, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8> | |
%31 = tensor.empty(%21, %22) : tensor<?x?x4x2x4x16x2x8xi8> | |
%32 = linalg.generic {indexing_maps = [#map9, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%31 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%33 = affine.apply #map()[%6] | |
%34 = affine.apply #map()[%7] | |
%35 = tensor.empty(%33, %34) : tensor<?x?x128x128xi32> | |
%36 = affine.apply #map2()[%6] | |
%37 = affine.apply #map2()[%7] | |
%38 = tensor.empty(%36, %37) : tensor<?x?xi32> | |
%39 = linalg.fill ins(%c0_i32 : i32) outs(%38 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%inserted_slice_4 = tensor.insert_slice %8 into %39[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32> | |
%40 = arith.divui %36, %c128 : index | |
%41 = arith.divui %37, %c128 : index | |
%expanded_5 = tensor.expand_shape %inserted_slice_4 [[0, 1], [2, 3]] output_shape [%40, 128, %41, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32> | |
%42 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_5 : tensor<?x128x?x128xi32>) outs(%35 : tensor<?x?x128x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x128x128xi32> | |
%expanded_6 = tensor.expand_shape %42 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%33, %34, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32> | |
%43 = tensor.empty(%33, %34) : tensor<?x?x8x4x2x4x16x4xi32> | |
%44 = linalg.generic {indexing_maps = [#map11, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_6 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%43 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%45 = iree_gpu.multi_mma %20, %32, %44 {indexing_maps = [#map12, #map13, #map14], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%46 = tensor.empty(%33, %34) : tensor<?x?x8x4x4x4x2x16xi32> | |
%47 = linalg.generic {indexing_maps = [#map15, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%45 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%46 : tensor<?x?x8x4x4x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x4x4x2x16xi32> | |
%collapsed = tensor.collapse_shape %47 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32> | |
%48 = tensor.empty(%33, %34) : tensor<?x128x?x128xi32> | |
%49 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%collapsed : tensor<?x?x128x128xi32>) outs(%48 : tensor<?x128x?x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x128x?x128xi32> | |
%collapsed_7 = tensor.collapse_shape %49 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed_7[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%50 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %50 : !hal.buffer_view | |
} | |
} | |
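// NOTE (annotation, not compiler output): the module above shows the i8 matmul after
// materializing its data-tiled encoding for gfx942. Each operand is zero-padded
// (linalg.fill + tensor.insert_slice) up to a multiple of its tile size (128 along
// M and N, 64 along K), packed by expand_shape + transpose into the operand layout of
// iree_gpu.multi_mma (intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2,
// unroll_n_to_subgroups = 4, unroll_k = 2), and un-padded again with
// tensor.extract_slice after the multiply. The multi_mma contraction maps
// ((d0, d2), (d1, d2) -> (d0, d1)) treat the two outer dims as an M x K by N x K
// product accumulating into an M x N grid of tiles. Below is a minimal Python sketch
// of the #map..#map3 padding arithmetic, using a hypothetical M = 1000 (illustrative
// only; these dims are dynamic in the dump):

def ceildiv(s0, b):
    # matches affine_map<()[s0] -> (s0 ceildiv b)>
    return -(-s0 // b)

M = 1000                     # stands in for the dynamic dim %0
m_tiles = ceildiv(M, 128)    # %9: number of 128-wide M tiles -> 8
M_padded = m_tiles * 128     # %12: padded M extent (#map2)   -> 1024
assert M_padded // 128 == m_tiles  # %16 recovers the tile count via arith.divui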
// -----// IR Dump After FusionPreprocessingPass (iree-dispatch-creation-fusion-preprocessing) //----- // | |
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8> | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%13 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%14 = tensor.empty(%12, %13) : tensor<?x?xi8> | |
%15 = linalg.fill ins(%c0_i8 : i8) outs(%14 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %15[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%16 = arith.divui %12, %c128 : index | |
%17 = arith.divui %13, %c64 : index | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_0 = tensor.expand_shape %18 [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8> | |
%19 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_0 : tensor<?x?x8x16x2x4x8xi8>) outs(%19 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
%21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%23 = tensor.empty(%21, %22) : tensor<?x?x128x64xi8> | |
%24 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%25 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%26 = tensor.empty(%24, %25) : tensor<?x?xi8> | |
%27 = linalg.fill ins(%c0_i8 : i8) outs(%26 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice_1 = tensor.insert_slice %5 into %27[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%28 = arith.divui %24, %c64 : index | |
%29 = arith.divui %25, %c128 : index | |
%expanded_2 = tensor.expand_shape %inserted_slice_1 [[0, 1], [2, 3]] output_shape [%28, 64, %29, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8> | |
%30 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d0, d3, d1)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2 : tensor<?x64x?x128xi8>) outs(%23 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_3 = tensor.expand_shape %30 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%21, %22, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8> | |
%31 = tensor.empty(%21, %22) : tensor<?x?x4x2x4x16x2x8xi8> | |
%32 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%31 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%33 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%34 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%35 = tensor.empty(%33, %34) : tensor<?x?x128x128xi32> | |
%36 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%37 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%38 = tensor.empty(%36, %37) : tensor<?x?xi32> | |
%39 = linalg.fill ins(%c0_i32 : i32) outs(%38 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%inserted_slice_4 = tensor.insert_slice %8 into %39[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32> | |
%40 = arith.divui %36, %c128 : index | |
%41 = arith.divui %37, %c128 : index | |
%expanded_5 = tensor.expand_shape %inserted_slice_4 [[0, 1], [2, 3]] output_shape [%40, 128, %41, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32> | |
%42 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_5 : tensor<?x128x?x128xi32>) outs(%35 : tensor<?x?x128x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x128x128xi32> | |
%expanded_6 = tensor.expand_shape %42 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%33, %34, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32> | |
%43 = tensor.empty(%33, %34) : tensor<?x?x8x4x2x4x16x4xi32> | |
%44 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_6 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%43 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%45 = iree_gpu.multi_mma %20, %32, %44 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%46 = tensor.empty(%33, %34) : tensor<?x?x8x4x4x4x2x16xi32> | |
%47 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d7, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%45 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%46 : tensor<?x?x8x4x4x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x4x4x2x16xi32> | |
%collapsed = tensor.collapse_shape %47 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32> | |
%48 = tensor.empty(%33, %34) : tensor<?x128x?x128xi32> | |
%49 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%collapsed : tensor<?x?x128x128xi32>) outs(%48 : tensor<?x128x?x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x128x?x128xi32> | |
%collapsed_7 = tensor.collapse_shape %49 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed_7[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%50 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %50 : !hal.buffer_view | |
} | |
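// NOTE (annotation, not compiler output): relative to the preceding dump,
// iree-dispatch-creation-fusion-preprocessing inlined the #map aliases and
// interchanged the iteration spaces of the transpose-like linalg.generic ops, so
// each op now reads its input through an identity map and carries the (inverse)
// permutation on its output map instead. For example, the first pack op changed from
// ins (d0, d1, d2, d3) -> (d0, d2, d1, d3) / outs identity to
// ins identity / outs (d0, d1, d2, d3) -> (d0, d2, d1, d3).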
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
// (function body omitted: verbatim duplicate of the dump after FusionPreprocessingPass above; canonicalization made no changes)
// -----// IR Dump After CSE (cse) //----- // | |
// (function body omitted: verbatim duplicate of the preceding dump; CSE made no changes)
// -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- // | |
// (function body omitted: verbatim duplicate of the preceding dump; elementwise op fusion made no changes)
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
// (function body omitted: verbatim duplicate of the preceding dump; canonicalization made no changes)
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8> | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%13 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%14 = tensor.empty(%12, %13) : tensor<?x?xi8> | |
%15 = linalg.fill ins(%c0_i8 : i8) outs(%14 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %15[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%16 = arith.divui %12, %c128 : index | |
%17 = arith.divui %13, %c64 : index | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_0 = tensor.expand_shape %18 [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8> | |
%19 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_0 : tensor<?x?x8x16x2x4x8xi8>) outs(%19 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
%21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%23 = tensor.empty(%21, %22) : tensor<?x?x128x64xi8> | |
%24 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%25 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%26 = tensor.empty(%24, %25) : tensor<?x?xi8> | |
%27 = linalg.fill ins(%c0_i8 : i8) outs(%26 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice_1 = tensor.insert_slice %5 into %27[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%28 = arith.divui %24, %c64 : index | |
%29 = arith.divui %25, %c128 : index | |
%expanded_2 = tensor.expand_shape %inserted_slice_1 [[0, 1], [2, 3]] output_shape [%28, 64, %29, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8> | |
%30 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d0, d3, d1)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2 : tensor<?x64x?x128xi8>) outs(%23 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_3 = tensor.expand_shape %30 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%21, %22, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8> | |
%31 = tensor.empty(%21, %22) : tensor<?x?x4x2x4x16x2x8xi8> | |
%32 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%31 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
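// Accumulator (input2) packing: both M and N are rounded up to 128, tiles are
// 128x128xi32, and the swizzle produces the 8x4x2x4x16x4 per-tile order that
// multi_mma writes its results in.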
%33 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%34 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%35 = tensor.empty(%33, %34) : tensor<?x?x128x128xi32> | |
%36 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%37 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%38 = tensor.empty(%36, %37) : tensor<?x?xi32> | |
%39 = linalg.fill ins(%c0_i32 : i32) outs(%38 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%inserted_slice_4 = tensor.insert_slice %8 into %39[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32> | |
%40 = arith.divui %36, %c128 : index | |
%41 = arith.divui %37, %c128 : index | |
%expanded_5 = tensor.expand_shape %inserted_slice_4 [[0, 1], [2, 3]] output_shape [%40, 128, %41, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32> | |
%42 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_5 : tensor<?x128x?x128xi32>) outs(%35 : tensor<?x?x128x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x128x128xi32> | |
%expanded_6 = tensor.expand_shape %42 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%33, %34, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32> | |
%43 = tensor.empty(%33, %34) : tensor<?x?x8x4x2x4x16x4xi32> | |
%44 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_6 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%43 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
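// The data-tile sizes above can be read off the kind attribute on the multi_mma
// below: MFMA_I32_16x16x32_I8 computes a 16x16 (M x N) i32 tile from a K=32
// slice of i8 operands, so M_tile = 16 * unroll_m(8) = 128,
// N_tile = 16 * unroll_n(2) * unroll_n_to_subgroups(4) = 128, and
// K_tile = 32 * unroll_k(2) = 64, which matches the ceildiv 128 / ceildiv 64
// padding computed for each operand.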
%45 = iree_gpu.multi_mma %20, %32, %44 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
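// Everything from here to the export inverts the accumulator packing: swizzle
// the result tiles back to row-major 128x128, undo the tiling, collapse to 2-D,
// and extract the original %6 x %7 slice so the padding never escapes.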
%46 = tensor.empty(%33, %34) : tensor<?x?x8x4x4x4x2x16xi32> | |
%47 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d7, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%45 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%46 : tensor<?x?x8x4x4x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x4x4x2x16xi32> | |
%collapsed = tensor.collapse_shape %47 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32> | |
%48 = tensor.empty(%33, %34) : tensor<?x128x?x128xi32> | |
%49 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%collapsed : tensor<?x?x128x128xi32>) outs(%48 : tensor<?x128x?x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x128x?x128xi32> | |
%collapsed_7 = tensor.collapse_shape %49 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed_7[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%50 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %50 : !hal.buffer_view | |
} | |
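// Worked example of the padding arithmetic (illustrative values, not taken from
// this trace): for a 1000x500 LHS, ceildiv(1000, 128) = 8 and
// ceildiv(500, 64) = 8, so the padded buffer is 1024x512 and the packed tensor
// is 8x8x128x64, i.e. 64 tiles of 128x64, of which the last 24 rows and 12
// columns are zero padding.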
// -----// IR Dump After BubbleUpExpandShapesPass (iree-dispatch-creation-bubble-up-expand-shapes) //----- // | |
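// This pass moves tensor.expand_shape ops toward the function arguments so the
// layout generics see the fully expanded 7-D/8-D shapes directly, introducing
// tensor.dim ops to carry the dynamic tile counts. The two transposing generics
// per operand end up adjacent, which lets the elementwise fusion later in the
// pipeline combine them.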
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c1 = arith.constant 1 : index | |
%c2 = arith.constant 2 : index | |
%c0 = arith.constant 0 : index | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8> | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%13 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%14 = tensor.empty(%12, %13) : tensor<?x?xi8> | |
%15 = linalg.fill ins(%c0_i8 : i8) outs(%14 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %15[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%16 = arith.divui %12, %c128 : index | |
%17 = arith.divui %13, %c64 : index | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8> | |
%dim = tensor.dim %expanded, %c0 : tensor<?x128x?x64xi8> | |
%dim_0 = tensor.dim %expanded, %c2 : tensor<?x128x?x64xi8> | |
%expanded_1 = tensor.expand_shape %expanded [[0], [1, 2], [3], [4, 5, 6]] output_shape [%dim, 8, 16, %dim_0, 2, 4, 8] : tensor<?x128x?x64xi8> into tensor<?x8x16x?x2x4x8xi8> | |
%dim_2 = tensor.dim %11, %c0 : tensor<?x?x128x64xi8> | |
%dim_3 = tensor.dim %11, %c1 : tensor<?x?x128x64xi8> | |
%18 = tensor.empty(%dim_2, %dim_3) : tensor<?x?x8x16x2x4x8xi8> | |
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d3, d1, d2, d4, d5, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1 : tensor<?x8x16x?x2x4x8xi8>) outs(%18 : tensor<?x?x8x16x2x4x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x16x2x4x8xi8> | |
%20 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%21 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%19 : tensor<?x?x8x16x2x4x8xi8>) outs(%20 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%23 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%24 = tensor.empty(%22, %23) : tensor<?x?x128x64xi8> | |
%25 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%26 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%27 = tensor.empty(%25, %26) : tensor<?x?xi8> | |
%28 = linalg.fill ins(%c0_i8 : i8) outs(%27 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice_4 = tensor.insert_slice %5 into %28[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%29 = arith.divui %25, %c64 : index | |
%30 = arith.divui %26, %c128 : index | |
%expanded_5 = tensor.expand_shape %inserted_slice_4 [[0, 1], [2, 3]] output_shape [%29, 64, %30, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8> | |
%dim_6 = tensor.dim %expanded_5, %c0 : tensor<?x64x?x128xi8> | |
%dim_7 = tensor.dim %expanded_5, %c2 : tensor<?x64x?x128xi8> | |
%expanded_8 = tensor.expand_shape %expanded_5 [[0], [1, 2, 3], [4], [5, 6, 7]] output_shape [%dim_6, 2, 4, 8, %dim_7, 4, 2, 16] : tensor<?x64x?x128xi8> into tensor<?x2x4x8x?x4x2x16xi8> | |
%dim_9 = tensor.dim %24, %c0 : tensor<?x?x128x64xi8> | |
%dim_10 = tensor.dim %24, %c1 : tensor<?x?x128x64xi8> | |
%31 = tensor.empty(%dim_9, %dim_10) : tensor<?x?x4x2x16x2x4x8xi8> | |
%32 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d4, d0, d5, d6, d7, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_8 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%31 : tensor<?x?x4x2x16x2x4x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x16x2x4x8xi8> | |
%33 = tensor.empty(%22, %23) : tensor<?x?x4x2x4x16x2x8xi8> | |
%34 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%32 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%33 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%35 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%36 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%37 = tensor.empty(%35, %36) : tensor<?x?x128x128xi32> | |
%38 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%39 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%40 = tensor.empty(%38, %39) : tensor<?x?xi32> | |
%41 = linalg.fill ins(%c0_i32 : i32) outs(%40 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%inserted_slice_11 = tensor.insert_slice %8 into %41[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32> | |
%42 = arith.divui %38, %c128 : index | |
%43 = arith.divui %39, %c128 : index | |
%expanded_12 = tensor.expand_shape %inserted_slice_11 [[0, 1], [2, 3]] output_shape [%42, 128, %43, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32> | |
%dim_13 = tensor.dim %expanded_12, %c0 : tensor<?x128x?x128xi32> | |
%dim_14 = tensor.dim %expanded_12, %c2 : tensor<?x128x?x128xi32> | |
%expanded_15 = tensor.expand_shape %expanded_12 [[0], [1, 2, 3], [4], [5, 6, 7]] output_shape [%dim_13, 8, 4, 4, %dim_14, 4, 2, 16] : tensor<?x128x?x128xi32> into tensor<?x8x4x4x?x4x2x16xi32> | |
%dim_16 = tensor.dim %37, %c0 : tensor<?x?x128x128xi32> | |
%dim_17 = tensor.dim %37, %c1 : tensor<?x?x128x128xi32> | |
%44 = tensor.empty(%dim_16, %dim_17) : tensor<?x?x8x4x4x4x2x16xi32> | |
%45 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d4, d1, d2, d3, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_15 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%44 : tensor<?x?x8x4x4x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x4x4x2x16xi32> | |
%46 = tensor.empty(%35, %36) : tensor<?x?x8x4x2x4x16x4xi32> | |
%47 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%45 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%46 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%48 = iree_gpu.multi_mma %21, %34, %47 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%49 = tensor.empty(%35, %36) : tensor<?x?x8x4x4x4x2x16xi32> | |
%50 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d7, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%48 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%49 : tensor<?x?x8x4x4x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x4x4x2x16xi32> | |
%51 = tensor.empty(%35, %36) : tensor<?x128x?x128xi32> | |
%dim_18 = tensor.dim %51, %c0 : tensor<?x128x?x128xi32> | |
%dim_19 = tensor.dim %51, %c2 : tensor<?x128x?x128xi32> | |
%52 = tensor.empty(%dim_18, %dim_19) : tensor<?x8x4x4x?x4x2x16xi32> | |
%53 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%50 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%52 : tensor<?x8x4x4x?x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x8x4x4x?x4x2x16xi32> | |
%collapsed = tensor.collapse_shape %53 [[0], [1, 2, 3], [4], [5, 6, 7]] : tensor<?x8x4x4x?x4x2x16xi32> into tensor<?x128x?x128xi32> | |
%collapsed_20 = tensor.collapse_shape %collapsed [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed_20[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%54 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %54 : !hal.buffer_view | |
} | |
// -----// IR Dump After BubbleUpExtractSlicesPass (iree-dispatch-creation-bubble-up-extract-slices) //----- // | |
// (no changes; the function body is byte-for-byte identical to the previous dump)
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
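// Canonicalization folds the arith.divui / tensor.dim chains back into
// affine.apply results, drops the now-dead 4-D tensor.empty values, and merges
// each pad's two expand_shapes into a single 2-D -> 7-D (or 8-D) expand_shape,
// leaving one expand_shape plus two generics per operand.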
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%13 = tensor.empty(%11, %12) : tensor<?x?xi8> | |
%14 = linalg.fill ins(%c0_i8 : i8) outs(%13 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %14[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%15 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%16 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1, 2], [3, 4, 5, 6]] output_shape [%15, 8, 16, %16, 2, 4, 8] : tensor<?x?xi8> into tensor<?x8x16x?x2x4x8xi8> | |
%17 = tensor.empty(%9, %10) : tensor<?x?x8x16x2x4x8xi8> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d3, d1, d2, d4, d5, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x8x16x?x2x4x8xi8>) outs(%17 : tensor<?x?x8x16x2x4x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x16x2x4x8xi8> | |
%19 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%18 : tensor<?x?x8x16x2x4x8xi8>) outs(%19 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
%21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%23 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%24 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%25 = tensor.empty(%23, %24) : tensor<?x?xi8> | |
%26 = linalg.fill ins(%c0_i8 : i8) outs(%25 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice_0 = tensor.insert_slice %5 into %26[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%27 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%28 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%expanded_1 = tensor.expand_shape %inserted_slice_0 [[0, 1, 2, 3], [4, 5, 6, 7]] output_shape [%27, 2, 4, 8, %28, 4, 2, 16] : tensor<?x?xi8> into tensor<?x2x4x8x?x4x2x16xi8> | |
%29 = tensor.empty(%21, %22) : tensor<?x?x4x2x16x2x4x8xi8> | |
%30 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d4, d0, d5, d6, d7, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%29 : tensor<?x?x4x2x16x2x4x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x16x2x4x8xi8> | |
%31 = tensor.empty(%21, %22) : tensor<?x?x4x2x4x16x2x8xi8> | |
%32 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%30 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%31 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%33 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%34 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%35 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%36 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%37 = tensor.empty(%35, %36) : tensor<?x?xi32> | |
%38 = linalg.fill ins(%c0_i32 : i32) outs(%37 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%inserted_slice_2 = tensor.insert_slice %8 into %38[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32> | |
%39 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%40 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%expanded_3 = tensor.expand_shape %inserted_slice_2 [[0, 1, 2, 3], [4, 5, 6, 7]] output_shape [%39, 8, 4, 4, %40, 4, 2, 16] : tensor<?x?xi32> into tensor<?x8x4x4x?x4x2x16xi32> | |
%41 = tensor.empty(%33, %34) : tensor<?x?x8x4x4x4x2x16xi32> | |
%42 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d4, d1, d2, d3, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%41 : tensor<?x?x8x4x4x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x4x4x2x16xi32> | |
%43 = tensor.empty(%33, %34) : tensor<?x?x8x4x2x4x16x4xi32> | |
%44 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%42 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%43 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%45 = iree_gpu.multi_mma %20, %32, %44 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%46 = tensor.empty(%33, %34) : tensor<?x?x8x4x4x4x2x16xi32> | |
%47 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d7, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%45 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%46 : tensor<?x?x8x4x4x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x4x4x2x16xi32> | |
%48 = tensor.empty(%33, %34) : tensor<?x8x4x4x?x4x2x16xi32> | |
%49 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%47 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%48 : tensor<?x8x4x4x?x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x8x4x4x?x4x2x16xi32> | |
%collapsed = tensor.collapse_shape %49 [[0, 1, 2, 3], [4, 5, 6, 7]] : tensor<?x8x4x4x?x4x2x16xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%50 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %50 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
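// CSE deduplicates the affine.apply recomputations left by canonicalization
// (%15/%16, %27/%28, and %39/%40 in the previous dump) and reuses the
// accumulator's 8-D tensor.empty (%35 below) as the init of the result
// un-swizzle instead of allocating a second identical empty.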
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%13 = tensor.empty(%11, %12) : tensor<?x?xi8> | |
%14 = linalg.fill ins(%c0_i8 : i8) outs(%13 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %14[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1, 2], [3, 4, 5, 6]] output_shape [%9, 8, 16, %10, 2, 4, 8] : tensor<?x?xi8> into tensor<?x8x16x?x2x4x8xi8> | |
%15 = tensor.empty(%9, %10) : tensor<?x?x8x16x2x4x8xi8> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d3, d1, d2, d4, d5, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x8x16x?x2x4x8xi8>) outs(%15 : tensor<?x?x8x16x2x4x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x16x2x4x8xi8> | |
%17 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%16 : tensor<?x?x8x16x2x4x8xi8>) outs(%17 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
%19 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%20 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%21 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%22 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%23 = tensor.empty(%21, %22) : tensor<?x?xi8> | |
%24 = linalg.fill ins(%c0_i8 : i8) outs(%23 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice_0 = tensor.insert_slice %5 into %24[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%expanded_1 = tensor.expand_shape %inserted_slice_0 [[0, 1, 2, 3], [4, 5, 6, 7]] output_shape [%20, 2, 4, 8, %19, 4, 2, 16] : tensor<?x?xi8> into tensor<?x2x4x8x?x4x2x16xi8> | |
%25 = tensor.empty(%19, %20) : tensor<?x?x4x2x16x2x4x8xi8> | |
%26 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d4, d0, d5, d6, d7, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%25 : tensor<?x?x4x2x16x2x4x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x16x2x4x8xi8> | |
%27 = tensor.empty(%19, %20) : tensor<?x?x4x2x4x16x2x8xi8> | |
%28 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%26 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%27 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%29 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%30 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%31 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%32 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%33 = tensor.empty(%31, %32) : tensor<?x?xi32> | |
%34 = linalg.fill ins(%c0_i32 : i32) outs(%33 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%inserted_slice_2 = tensor.insert_slice %8 into %34[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32> | |
%expanded_3 = tensor.expand_shape %inserted_slice_2 [[0, 1, 2, 3], [4, 5, 6, 7]] output_shape [%29, 8, 4, 4, %30, 4, 2, 16] : tensor<?x?xi32> into tensor<?x8x4x4x?x4x2x16xi32> | |
%35 = tensor.empty(%29, %30) : tensor<?x?x8x4x4x4x2x16xi32> | |
%36 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d4, d1, d2, d3, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%35 : tensor<?x?x8x4x4x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x4x4x2x16xi32> | |
%37 = tensor.empty(%29, %30) : tensor<?x?x8x4x2x4x16x4xi32> | |
%38 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%36 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%37 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%39 = iree_gpu.multi_mma %18, %28, %38 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%40 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d7, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%39 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%35 : tensor<?x?x8x4x4x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x4x4x2x16xi32> | |
%41 = tensor.empty(%29, %30) : tensor<?x8x4x4x?x4x2x16xi32> | |
%42 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%40 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%41 : tensor<?x8x4x4x?x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x8x4x4x?x4x2x16xi32> | |
%collapsed = tensor.collapse_shape %42 [[0, 1, 2, 3], [4, 5, 6, 7]] : tensor<?x8x4x4x?x4x2x16xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%43 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %43 : !hal.buffer_view | |
} | |
// -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- // | |
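// Elementwise fusion composes each operand's two transposing generics into one:
// the surviving generic's input map is the composition of the two permutations,
// so every packed operand is now produced by a single expand_shape + generic
// pair, and likewise in reverse for the result un-swizzle.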
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%13 = tensor.empty(%11, %12) : tensor<?x?xi8> | |
%14 = linalg.fill ins(%c0_i8 : i8) outs(%13 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %14[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1, 2], [3, 4, 5, 6]] output_shape [%9, 8, 16, %10, 2, 4, 8] : tensor<?x?xi8> into tensor<?x8x16x?x2x4x8xi8> | |
%15 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d1, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x8x16x?x2x4x8xi8>) outs(%15 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
%17 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%19 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%20 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%21 = tensor.empty(%19, %20) : tensor<?x?xi8> | |
%22 = linalg.fill ins(%c0_i8 : i8) outs(%21 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice_0 = tensor.insert_slice %5 into %22[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%expanded_1 = tensor.expand_shape %inserted_slice_0 [[0, 1, 2, 3], [4, 5, 6, 7]] output_shape [%18, 2, 4, 8, %17, 4, 2, 16] : tensor<?x?xi8> into tensor<?x2x4x8x?x4x2x16xi8> | |
%23 = tensor.empty(%17, %18) : tensor<?x?x4x2x4x16x2x8xi8> | |
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d5, d6, d7, d0, d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%23 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%25 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%26 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%27 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%28 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%29 = tensor.empty(%27, %28) : tensor<?x?xi32> | |
%30 = linalg.fill ins(%c0_i32 : i32) outs(%29 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%inserted_slice_2 = tensor.insert_slice %8 into %30[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32> | |
%expanded_3 = tensor.expand_shape %inserted_slice_2 [[0, 1, 2, 3], [4, 5, 6, 7]] output_shape [%25, 8, 4, 4, %26, 4, 2, 16] : tensor<?x?xi32> into tensor<?x8x4x4x?x4x2x16xi32> | |
%31 = tensor.empty(%25, %26) : tensor<?x?x8x4x2x4x16x4xi32> | |
%32 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%31 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%33 = iree_gpu.multi_mma %16, %24, %32 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%34 = tensor.empty(%25, %26) : tensor<?x8x4x4x?x4x2x16xi32> | |
%35 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%33 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%34 : tensor<?x8x4x4x?x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x8x4x4x?x4x2x16xi32> | |
%collapsed = tensor.collapse_shape %35 [[0, 1, 2, 3], [4, 5, 6, 7]] : tensor<?x8x4x4x?x4x2x16xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%36 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %36 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%13 = tensor.empty(%11, %12) : tensor<?x?xi8> | |
%14 = linalg.fill ins(%c0_i8 : i8) outs(%13 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %14[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1, 2], [3, 4, 5, 6]] output_shape [%9, 8, 16, %10, 2, 4, 8] : tensor<?x?xi8> into tensor<?x8x16x?x2x4x8xi8> | |
%15 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
  %16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d1, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x8x16x?x2x4x8xi8>) outs(%15 : tensor<?x?x8x4x16x2x8xi8>) {
  ^bb0(%in: i8, %out: i8):
    linalg.yield %in : i8
  } -> tensor<?x?x8x4x16x2x8xi8>
  %17 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4]
  %18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3]
  %19 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3]
  %20 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4]
  %21 = tensor.empty(%19, %20) : tensor<?x?xi8>
  %22 = linalg.fill ins(%c0_i8 : i8) outs(%21 : tensor<?x?xi8>) -> tensor<?x?xi8>
  %inserted_slice_0 = tensor.insert_slice %5 into %22[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8>
  %expanded_1 = tensor.expand_shape %inserted_slice_0 [[0, 1, 2, 3], [4, 5, 6, 7]] output_shape [%18, 2, 4, 8, %17, 4, 2, 16] : tensor<?x?xi8> into tensor<?x2x4x8x?x4x2x16xi8>
  %23 = tensor.empty(%17, %18) : tensor<?x?x4x2x4x16x2x8xi8>
  %24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d5, d6, d7, d0, d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%23 : tensor<?x?x4x2x4x16x2x8xi8>) {
  ^bb0(%in: i8, %out: i8):
    linalg.yield %in : i8
  } -> tensor<?x?x4x2x4x16x2x8xi8>
  %25 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6]
  %26 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7]
  %27 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6]
  %28 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7]
  %29 = tensor.empty(%27, %28) : tensor<?x?xi32>
  %30 = linalg.fill ins(%c0_i32 : i32) outs(%29 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %inserted_slice_2 = tensor.insert_slice %8 into %30[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32>
  %expanded_3 = tensor.expand_shape %inserted_slice_2 [[0, 1, 2, 3], [4, 5, 6, 7]] output_shape [%25, 8, 4, 4, %26, 4, 2, 16] : tensor<?x?xi32> into tensor<?x8x4x4x?x4x2x16xi32>
  %31 = tensor.empty(%25, %26) : tensor<?x?x8x4x2x4x16x4xi32>
  %32 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%31 : tensor<?x?x8x4x2x4x16x4xi32>) {
  ^bb0(%in: i32, %out: i32):
    linalg.yield %in : i32
  } -> tensor<?x?x8x4x2x4x16x4xi32>
  %33 = iree_gpu.multi_mma %16, %24, %32 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
  %34 = tensor.empty(%25, %26) : tensor<?x8x4x4x?x4x2x16xi32>
  %35 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%33 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%34 : tensor<?x8x4x4x?x4x2x16xi32>) {
  ^bb0(%in: i32, %out: i32):
    linalg.yield %in : i32
  } -> tensor<?x8x4x4x?x4x2x16xi32>
  %collapsed = tensor.collapse_shape %35 [[0, 1, 2, 3], [4, 5, 6, 7]] : tensor<?x8x4x4x?x4x2x16xi32> into tensor<?x?xi32>
  %extracted_slice = tensor.extract_slice %collapsed[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32>
  %36 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %36 : !hal.buffer_view
}
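// -----// Note: reading the data-tiled layout above //----- //
// The padding in @foo follows from the #iree_gpu.data_tiled_mma_layout kind on
// the multi_mma op, assuming the usual MFMA_I32_16x16x32_I8 shape (16x16 i32
// result tile, K = 32, i8 operands):
//   M tile = 16 * unroll_m(8)                            = 128
//   N tile = 16 * unroll_n(2) * unroll_n_to_subgroups(4) = 128
//   K tile = 32 * unroll_k(2)                            = 64
// which is exactly what the `s0 ceildiv 128` / `s0 ceildiv 64` affine maps round
// up to. For example, a 1000x500 LHS would be zero-padded to 1024x512, since
// ceildiv(1000, 128) * 128 = 1024 and ceildiv(500, 64) * 64 = 512. Each padded
// operand is then split into tiles by tensor.expand_shape and permuted into
// tile-major order by a relayout linalg.generic; after the multi_mma, the
// inverse generic, collapse_shape, and extract_slice recover the unpadded
// %6 x %7 i32 result.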
// -----// IR Dump After CSE (cse) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %c0_i32 = arith.constant 0 : i32
  %c0_i8 = arith.constant 0 : i8
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0]
  %10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1]
  %11 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0]
  %12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1]
  %13 = tensor.empty(%11, %12) : tensor<?x?xi8>
  %14 = linalg.fill ins(%c0_i8 : i8) outs(%13 : tensor<?x?xi8>) -> tensor<?x?xi8>
  %inserted_slice = tensor.insert_slice %2 into %14[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8>
  %expanded = tensor.expand_shape %inserted_slice [[0, 1, 2], [3, 4, 5, 6]] output_shape [%9, 8, 16, %10, 2, 4, 8] : tensor<?x?xi8> into tensor<?x8x16x?x2x4x8xi8>
  %15 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8>
  %16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d1, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x8x16x?x2x4x8xi8>) outs(%15 : tensor<?x?x8x4x16x2x8xi8>) {
  ^bb0(%in: i8, %out: i8):
    linalg.yield %in : i8
  } -> tensor<?x?x8x4x16x2x8xi8>
  %17 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4]
  %18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3]
  %19 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3]
  %20 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4]
  %21 = tensor.empty(%19, %20) : tensor<?x?xi8>
  %22 = linalg.fill ins(%c0_i8 : i8) outs(%21 : tensor<?x?xi8>) -> tensor<?x?xi8>
  %inserted_slice_0 = tensor.insert_slice %5 into %22[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8>
  %expanded_1 = tensor.expand_shape %inserted_slice_0 [[0, 1, 2, 3], [4, 5, 6, 7]] output_shape [%18, 2, 4, 8, %17, 4, 2, 16] : tensor<?x?xi8> into tensor<?x2x4x8x?x4x2x16xi8>
  %23 = tensor.empty(%17, %18) : tensor<?x?x4x2x4x16x2x8xi8>
  %24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d5, d6, d7, d0, d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%23 : tensor<?x?x4x2x4x16x2x8xi8>) {
  ^bb0(%in: i8, %out: i8):
    linalg.yield %in : i8
  } -> tensor<?x?x4x2x4x16x2x8xi8>
  %25 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6]
  %26 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7]
  %27 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6]
  %28 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7]
  %29 = tensor.empty(%27, %28) : tensor<?x?xi32>
  %30 = linalg.fill ins(%c0_i32 : i32) outs(%29 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %inserted_slice_2 = tensor.insert_slice %8 into %30[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32>
  %expanded_3 = tensor.expand_shape %inserted_slice_2 [[0, 1, 2, 3], [4, 5, 6, 7]] output_shape [%25, 8, 4, 4, %26, 4, 2, 16] : tensor<?x?xi32> into tensor<?x8x4x4x?x4x2x16xi32>
  %31 = tensor.empty(%25, %26) : tensor<?x?x8x4x2x4x16x4xi32>
  %32 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%31 : tensor<?x?x8x4x2x4x16x4xi32>) {
  ^bb0(%in: i32, %out: i32):
    linalg.yield %in : i32
  } -> tensor<?x?x8x4x2x4x16x4xi32>
  %33 = iree_gpu.multi_mma %16, %24, %32 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
  %34 = tensor.empty(%25, %26) : tensor<?x8x4x4x?x4x2x16xi32>
  %35 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%33 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%34 : tensor<?x8x4x4x?x4x2x16xi32>) {
  ^bb0(%in: i32, %out: i32):
    linalg.yield %in : i32
  } -> tensor<?x8x4x4x?x4x2x16xi32>
  %collapsed = tensor.collapse_shape %35 [[0, 1, 2, 3], [4, 5, 6, 7]] : tensor<?x8x4x4x?x4x2x16xi32> into tensor<?x?xi32>
  %extracted_slice = tensor.extract_slice %collapsed[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32>
  %36 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %36 : !hal.buffer_view
}
// -----// IR Dump After SinkReshapesPass (iree-dispatch-creation-sink-reshapes) //----- //
// (function @foo unchanged from the previous dump)
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
// (function @foo unchanged from the previous dump)
// -----// IR Dump After CSE (cse) //----- //
// (function @foo unchanged from the previous dump)
// -----// IR Dump After FuseMultiUseElementwiseProducerPass (iree-dispatch-creation-fuse-multi-use-elementwise-producer) //----- //
// (function @foo unchanged from the previous dump)
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
// (function @foo unchanged from the previous dump)
// -----// IR Dump After CSE (cse) //----- //
// (function @foo unchanged from the previous dump)
// -----// IR Dump After SplitReductionPass (iree-dispatch-creation-split-reduction-ops) //----- //
// (function @foo unchanged from the previous dump)
// -----// IR Dump After TransposeGenericOpsPass (iree-dispatch-creation-transpose-generic-ops) //----- //
// (function @foo unchanged from the previous dump)
// -----// IR Dump After FormScalarDispatchesPass (iree-dispatch-creation-form-scalar-dispatches) //----- // | |
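// The function below is unchanged from the previous dump: every op here works
// on whole data-tiled tensors, so this pass finds no scalar computation to
// split out into its own dispatch.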
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%13 = tensor.empty(%11, %12) : tensor<?x?xi8> | |
%14 = linalg.fill ins(%c0_i8 : i8) outs(%13 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %14[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1, 2], [3, 4, 5, 6]] output_shape [%9, 8, 16, %10, 2, 4, 8] : tensor<?x?xi8> into tensor<?x8x16x?x2x4x8xi8> | |
%15 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d1, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x8x16x?x2x4x8xi8>) outs(%15 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
%17 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%19 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%20 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%21 = tensor.empty(%19, %20) : tensor<?x?xi8> | |
%22 = linalg.fill ins(%c0_i8 : i8) outs(%21 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice_0 = tensor.insert_slice %5 into %22[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%expanded_1 = tensor.expand_shape %inserted_slice_0 [[0, 1, 2, 3], [4, 5, 6, 7]] output_shape [%18, 2, 4, 8, %17, 4, 2, 16] : tensor<?x?xi8> into tensor<?x2x4x8x?x4x2x16xi8> | |
%23 = tensor.empty(%17, %18) : tensor<?x?x4x2x4x16x2x8xi8> | |
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d5, d6, d7, d0, d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%23 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%25 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%26 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%27 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%28 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%29 = tensor.empty(%27, %28) : tensor<?x?xi32> | |
%30 = linalg.fill ins(%c0_i32 : i32) outs(%29 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%inserted_slice_2 = tensor.insert_slice %8 into %30[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32> | |
%expanded_3 = tensor.expand_shape %inserted_slice_2 [[0, 1, 2, 3], [4, 5, 6, 7]] output_shape [%25, 8, 4, 4, %26, 4, 2, 16] : tensor<?x?xi32> into tensor<?x8x4x4x?x4x2x16xi32> | |
%31 = tensor.empty(%25, %26) : tensor<?x?x8x4x2x4x16x4xi32> | |
%32 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%31 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%33 = iree_gpu.multi_mma %16, %24, %32 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%34 = tensor.empty(%25, %26) : tensor<?x8x4x4x?x4x2x16xi32> | |
%35 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%33 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%34 : tensor<?x8x4x4x?x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x8x4x4x?x4x2x16xi32> | |
%collapsed = tensor.collapse_shape %35 [[0, 1, 2, 3], [4, 5, 6, 7]] : tensor<?x8x4x4x?x4x2x16xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%36 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %36 : !hal.buffer_view | |
} | |
// -----// IR Dump After FormDispatchRegionsPass (iree-dispatch-creation-form-dispatch-regions) //----- // | |
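// Dispatch regions are formed around the compute ops: each of the three
// data-tiling relayout linalg.generics, the iree_gpu.multi_mma, and the
// result-unswizzling linalg.generic is wrapped in a flow.dispatch.region that
// records its dynamic result extents (e.g. {%9, %10}). The fills,
// insert_slices and expand/collapse_shapes remain outside for later passes to
// clone in or convert.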
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%13 = tensor.empty(%11, %12) : tensor<?x?xi8> | |
%14 = linalg.fill ins(%c0_i8 : i8) outs(%13 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %14[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1, 2], [3, 4, 5, 6]] output_shape [%9, 8, 16, %10, 2, 4, 8] : tensor<?x?xi8> into tensor<?x8x16x?x2x4x8xi8> | |
%15 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%16 = flow.dispatch.region -> (tensor<?x?x8x4x16x2x8xi8>{%9, %10}) { | |
%37 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d1, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x8x16x?x2x4x8xi8>) outs(%15 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
flow.return %37 : tensor<?x?x8x4x16x2x8xi8> | |
} | |
%17 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%19 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%20 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%21 = tensor.empty(%19, %20) : tensor<?x?xi8> | |
%22 = linalg.fill ins(%c0_i8 : i8) outs(%21 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice_0 = tensor.insert_slice %5 into %22[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%expanded_1 = tensor.expand_shape %inserted_slice_0 [[0, 1, 2, 3], [4, 5, 6, 7]] output_shape [%18, 2, 4, 8, %17, 4, 2, 16] : tensor<?x?xi8> into tensor<?x2x4x8x?x4x2x16xi8> | |
%23 = tensor.empty(%17, %18) : tensor<?x?x4x2x4x16x2x8xi8> | |
%24 = flow.dispatch.region -> (tensor<?x?x4x2x4x16x2x8xi8>{%17, %18}) { | |
%37 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d5, d6, d7, d0, d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%23 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
flow.return %37 : tensor<?x?x4x2x4x16x2x8xi8> | |
} | |
%25 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%26 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%27 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%28 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%29 = tensor.empty(%27, %28) : tensor<?x?xi32> | |
%30 = linalg.fill ins(%c0_i32 : i32) outs(%29 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%inserted_slice_2 = tensor.insert_slice %8 into %30[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32> | |
%expanded_3 = tensor.expand_shape %inserted_slice_2 [[0, 1, 2, 3], [4, 5, 6, 7]] output_shape [%25, 8, 4, 4, %26, 4, 2, 16] : tensor<?x?xi32> into tensor<?x8x4x4x?x4x2x16xi32> | |
%31 = tensor.empty(%25, %26) : tensor<?x?x8x4x2x4x16x4xi32> | |
%32 = flow.dispatch.region -> (tensor<?x?x8x4x2x4x16x4xi32>{%25, %26}) { | |
%37 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%31 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
flow.return %37 : tensor<?x?x8x4x2x4x16x4xi32> | |
} | |
%33 = flow.dispatch.region -> (tensor<?x?x8x4x2x4x16x4xi32>{%25, %26}) { | |
%37 = iree_gpu.multi_mma %16, %24, %32 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.return %37 : tensor<?x?x8x4x2x4x16x4xi32> | |
} | |
%34 = tensor.empty(%25, %26) : tensor<?x8x4x4x?x4x2x16xi32> | |
%35 = flow.dispatch.region -> (tensor<?x8x4x4x?x4x2x16xi32>{%25, %26}) { | |
%37 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%33 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%34 : tensor<?x8x4x4x?x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x8x4x4x?x4x2x16xi32> | |
flow.return %37 : tensor<?x8x4x4x?x4x2x16xi32> | |
} | |
%collapsed = tensor.collapse_shape %35 [[0, 1, 2, 3], [4, 5, 6, 7]] : tensor<?x8x4x4x?x4x2x16xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%36 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %36 : !hal.buffer_view | |
} | |
// -----// IR Dump After CloneProducersIntoDispatchRegionsPass (iree-dispatch-creation-clone-producers-into-dispatch-regions) //----- // | |
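// The tensor.empty ops that served as relayout destinations are cloned from
// the function body into their consuming dispatch regions (each region now
// begins with its own tensor.empty), so a region captures only its data
// operands and dynamic dimension values rather than an empty init tensor.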
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%13 = tensor.empty(%11, %12) : tensor<?x?xi8> | |
%14 = linalg.fill ins(%c0_i8 : i8) outs(%13 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %14[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1, 2], [3, 4, 5, 6]] output_shape [%9, 8, 16, %10, 2, 4, 8] : tensor<?x?xi8> into tensor<?x8x16x?x2x4x8xi8> | |
%15 = flow.dispatch.region -> (tensor<?x?x8x4x16x2x8xi8>{%9, %10}) { | |
%33 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%34 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d1, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x8x16x?x2x4x8xi8>) outs(%33 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
flow.return %34 : tensor<?x?x8x4x16x2x8xi8> | |
} | |
%16 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%17 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%18 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%19 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%20 = tensor.empty(%18, %19) : tensor<?x?xi8> | |
%21 = linalg.fill ins(%c0_i8 : i8) outs(%20 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice_0 = tensor.insert_slice %5 into %21[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%expanded_1 = tensor.expand_shape %inserted_slice_0 [[0, 1, 2, 3], [4, 5, 6, 7]] output_shape [%17, 2, 4, 8, %16, 4, 2, 16] : tensor<?x?xi8> into tensor<?x2x4x8x?x4x2x16xi8> | |
%22 = flow.dispatch.region -> (tensor<?x?x4x2x4x16x2x8xi8>{%16, %17}) { | |
%33 = tensor.empty(%16, %17) : tensor<?x?x4x2x4x16x2x8xi8> | |
%34 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d5, d6, d7, d0, d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%33 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
flow.return %34 : tensor<?x?x4x2x4x16x2x8xi8> | |
} | |
%23 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%24 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%25 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%26 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%27 = tensor.empty(%25, %26) : tensor<?x?xi32> | |
%28 = linalg.fill ins(%c0_i32 : i32) outs(%27 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%inserted_slice_2 = tensor.insert_slice %8 into %28[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32> | |
%expanded_3 = tensor.expand_shape %inserted_slice_2 [[0, 1, 2, 3], [4, 5, 6, 7]] output_shape [%23, 8, 4, 4, %24, 4, 2, 16] : tensor<?x?xi32> into tensor<?x8x4x4x?x4x2x16xi32> | |
%29 = flow.dispatch.region -> (tensor<?x?x8x4x2x4x16x4xi32>{%23, %24}) { | |
%33 = tensor.empty(%23, %24) : tensor<?x?x8x4x2x4x16x4xi32> | |
%34 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%33 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
flow.return %34 : tensor<?x?x8x4x2x4x16x4xi32> | |
} | |
%30 = flow.dispatch.region -> (tensor<?x?x8x4x2x4x16x4xi32>{%23, %24}) { | |
%33 = iree_gpu.multi_mma %15, %22, %29 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.return %33 : tensor<?x?x8x4x2x4x16x4xi32> | |
} | |
%31 = flow.dispatch.region -> (tensor<?x8x4x4x?x4x2x16xi32>{%23, %24}) { | |
%33 = tensor.empty(%23, %24) : tensor<?x8x4x4x?x4x2x16xi32> | |
%34 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%30 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%33 : tensor<?x8x4x4x?x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x8x4x4x?x4x2x16xi32> | |
flow.return %34 : tensor<?x8x4x4x?x4x2x16xi32> | |
} | |
%collapsed = tensor.collapse_shape %31 [[0, 1, 2, 3], [4, 5, 6, 7]] : tensor<?x8x4x4x?x4x2x16xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%32 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %32 : !hal.buffer_view | |
} | |
// -----// IR Dump After CollapseDimensionsPass (iree-dispatch-creation-collapse-dimensions) //----- // | |
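// No dimensions are collapsed: the IR below is identical to the previous
// dump, likely because the transposing indexing maps in these relayouts leave
// no adjacent parallel dimensions that can be merged into one.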
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%13 = tensor.empty(%11, %12) : tensor<?x?xi8> | |
%14 = linalg.fill ins(%c0_i8 : i8) outs(%13 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %14[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1, 2], [3, 4, 5, 6]] output_shape [%9, 8, 16, %10, 2, 4, 8] : tensor<?x?xi8> into tensor<?x8x16x?x2x4x8xi8> | |
%15 = flow.dispatch.region -> (tensor<?x?x8x4x16x2x8xi8>{%9, %10}) { | |
%33 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%34 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d1, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x8x16x?x2x4x8xi8>) outs(%33 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
flow.return %34 : tensor<?x?x8x4x16x2x8xi8> | |
} | |
%16 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%17 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%18 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%19 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%20 = tensor.empty(%18, %19) : tensor<?x?xi8> | |
%21 = linalg.fill ins(%c0_i8 : i8) outs(%20 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice_0 = tensor.insert_slice %5 into %21[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%expanded_1 = tensor.expand_shape %inserted_slice_0 [[0, 1, 2, 3], [4, 5, 6, 7]] output_shape [%17, 2, 4, 8, %16, 4, 2, 16] : tensor<?x?xi8> into tensor<?x2x4x8x?x4x2x16xi8> | |
%22 = flow.dispatch.region -> (tensor<?x?x4x2x4x16x2x8xi8>{%16, %17}) { | |
%33 = tensor.empty(%16, %17) : tensor<?x?x4x2x4x16x2x8xi8> | |
%34 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d5, d6, d7, d0, d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%33 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
flow.return %34 : tensor<?x?x4x2x4x16x2x8xi8> | |
} | |
%23 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%24 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%25 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%26 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%27 = tensor.empty(%25, %26) : tensor<?x?xi32> | |
%28 = linalg.fill ins(%c0_i32 : i32) outs(%27 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%inserted_slice_2 = tensor.insert_slice %8 into %28[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32> | |
%expanded_3 = tensor.expand_shape %inserted_slice_2 [[0, 1, 2, 3], [4, 5, 6, 7]] output_shape [%23, 8, 4, 4, %24, 4, 2, 16] : tensor<?x?xi32> into tensor<?x8x4x4x?x4x2x16xi32> | |
%29 = flow.dispatch.region -> (tensor<?x?x8x4x2x4x16x4xi32>{%23, %24}) { | |
%33 = tensor.empty(%23, %24) : tensor<?x?x8x4x2x4x16x4xi32> | |
%34 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%33 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
flow.return %34 : tensor<?x?x8x4x2x4x16x4xi32> | |
} | |
%30 = flow.dispatch.region -> (tensor<?x?x8x4x2x4x16x4xi32>{%23, %24}) { | |
%33 = iree_gpu.multi_mma %15, %22, %29 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.return %33 : tensor<?x?x8x4x2x4x16x4xi32> | |
} | |
%31 = flow.dispatch.region -> (tensor<?x8x4x4x?x4x2x16xi32>{%23, %24}) { | |
%33 = tensor.empty(%23, %24) : tensor<?x8x4x4x?x4x2x16xi32> | |
%34 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%30 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%33 : tensor<?x8x4x4x?x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x8x4x4x?x4x2x16xi32> | |
flow.return %34 : tensor<?x8x4x4x?x4x2x16xi32> | |
} | |
%collapsed = tensor.collapse_shape %31 [[0, 1, 2, 3], [4, 5, 6, 7]] : tensor<?x8x4x4x?x4x2x16xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%32 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %32 : !hal.buffer_view | |
} | |
// -----// IR Dump After ConvertDispatchRegionsToWorkgroupsPass (iree-dispatch-creation-convert-dispatch-regions-to-workgroups) //----- // | |
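// Each flow.dispatch.region is rewritten as a flow.dispatch.workgroups op
// with an explicit ABI: tensor operands become !flow.dispatch.tensor bindings
// (readonly / writeonly / readwrite) accessed through
// flow.dispatch.tensor.load / flow.dispatch.tensor.store, and dynamic extents
// are passed in as index operands (hence the new tensor.dim ops and the
// duplicated %9, %10, ... arguments, which canonicalization later dedupes).
// Note the multi_mma dispatch updates its accumulator in place: its result is
// tied to the input (`-> %29{%23, %24}`) through a readwrite binding.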
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%13 = tensor.empty(%11, %12) : tensor<?x?xi8> | |
%14 = linalg.fill ins(%c0_i8 : i8) outs(%13 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %14[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1, 2], [3, 4, 5, 6]] output_shape [%9, 8, 16, %10, 2, 4, 8] : tensor<?x?xi8> into tensor<?x8x16x?x2x4x8xi8> | |
%c0 = arith.constant 0 : index | |
%dim = tensor.dim %expanded, %c0 : tensor<?x8x16x?x2x4x8xi8> | |
%c3 = arith.constant 3 : index | |
%dim_0 = tensor.dim %expanded, %c3 : tensor<?x8x16x?x2x4x8xi8> | |
%15 = flow.dispatch.workgroups(%9, %10, %expanded, %dim, %dim_0, %9, %10) : (index, index, tensor<?x8x16x?x2x4x8xi8>{%dim, %dim_0}, index, index, index, index) -> tensor<?x?x8x4x16x2x8xi8>{%9, %10} = | |
(%arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>) { | |
%33 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%arg6, 8, 16, %arg7, 2, 4, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>{%arg6, %arg7} -> tensor<?x8x16x?x2x4x8xi8> | |
%34 = tensor.empty(%arg8, %arg9) : tensor<?x?x8x4x16x2x8xi8> | |
%35 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d1, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%33 : tensor<?x8x16x?x2x4x8xi8>) outs(%34 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
flow.dispatch.tensor.store %35, %arg10, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%arg8, %arg9, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>{%arg8, %arg9} | |
flow.return | |
} | |
%16 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%17 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%18 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%19 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%20 = tensor.empty(%18, %19) : tensor<?x?xi8> | |
%21 = linalg.fill ins(%c0_i8 : i8) outs(%20 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice_1 = tensor.insert_slice %5 into %21[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%expanded_2 = tensor.expand_shape %inserted_slice_1 [[0, 1, 2, 3], [4, 5, 6, 7]] output_shape [%17, 2, 4, 8, %16, 4, 2, 16] : tensor<?x?xi8> into tensor<?x2x4x8x?x4x2x16xi8> | |
%c0_3 = arith.constant 0 : index | |
%dim_4 = tensor.dim %expanded_2, %c0_3 : tensor<?x2x4x8x?x4x2x16xi8> | |
%c4 = arith.constant 4 : index | |
%dim_5 = tensor.dim %expanded_2, %c4 : tensor<?x2x4x8x?x4x2x16xi8> | |
%22 = flow.dispatch.workgroups(%16, %17, %expanded_2, %dim_4, %dim_5, %16, %17) : (index, index, tensor<?x2x4x8x?x4x2x16xi8>{%dim_4, %dim_5}, index, index, index, index) -> tensor<?x?x4x2x4x16x2x8xi8>{%16, %17} = | |
(%arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>) { | |
%33 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg6, 2, 4, 8, %arg7, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>{%arg6, %arg7} -> tensor<?x2x4x8x?x4x2x16xi8> | |
%34 = tensor.empty(%arg8, %arg9) : tensor<?x?x4x2x4x16x2x8xi8> | |
%35 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d5, d6, d7, d0, d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%33 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%34 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
flow.dispatch.tensor.store %35, %arg10, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg8, %arg9, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x4x2x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>{%arg8, %arg9} | |
flow.return | |
} | |
%23 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%24 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%25 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%26 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%27 = tensor.empty(%25, %26) : tensor<?x?xi32> | |
%28 = linalg.fill ins(%c0_i32 : i32) outs(%27 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%inserted_slice_6 = tensor.insert_slice %8 into %28[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32> | |
%expanded_7 = tensor.expand_shape %inserted_slice_6 [[0, 1, 2, 3], [4, 5, 6, 7]] output_shape [%23, 8, 4, 4, %24, 4, 2, 16] : tensor<?x?xi32> into tensor<?x8x4x4x?x4x2x16xi32> | |
%c0_8 = arith.constant 0 : index | |
%dim_9 = tensor.dim %expanded_7, %c0_8 : tensor<?x8x4x4x?x4x2x16xi32> | |
%c4_10 = arith.constant 4 : index | |
%dim_11 = tensor.dim %expanded_7, %c4_10 : tensor<?x8x4x4x?x4x2x16xi32> | |
%29 = flow.dispatch.workgroups(%23, %24, %expanded_7, %dim_9, %dim_11, %23, %24) : (index, index, tensor<?x8x4x4x?x4x2x16xi32>{%dim_9, %dim_11}, index, index, index, index) -> tensor<?x?x8x4x2x4x16x4xi32>{%23, %24} = | |
(%arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>) { | |
%33 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg6, 8, 4, 4, %arg7, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>{%arg6, %arg7} -> tensor<?x8x4x4x?x4x2x16xi32> | |
%34 = tensor.empty(%arg8, %arg9) : tensor<?x?x8x4x2x4x16x4xi32> | |
%35 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%33 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%34 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %35, %arg10, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg8, %arg9, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>{%arg8, %arg9} | |
flow.return | |
} | |
%30 = flow.dispatch.workgroups(%15, %22, %29, %9, %10, %16, %17, %23, %24) : (tensor<?x?x8x4x16x2x8xi8>{%9, %10}, tensor<?x?x4x2x4x16x2x8xi8>{%16, %17}, tensor<?x?x8x4x2x4x16x4xi32>{%23, %24}, index, index, index, index, index, index) -> %29{%23, %24} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>, %arg5: !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index) { | |
%33 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%arg6, %arg7, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%arg6, %arg7} -> tensor<?x?x8x4x16x2x8xi8> | |
%34 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg8, %arg9, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%arg8, %arg9} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%35 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg10, %arg11, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%arg10, %arg11} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%36 = iree_gpu.multi_mma %33, %34, %35 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %36, %arg5, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg10, %arg11, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%arg10, %arg11} | |
flow.return | |
} | |
%31 = flow.dispatch.workgroups(%23, %24, %30, %23, %24, %23, %24) : (index, index, tensor<?x?x8x4x2x4x16x4xi32>{%23, %24}, index, index, index, index) -> tensor<?x8x4x4x?x4x2x16xi32>{%23, %24} = | |
(%arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>) { | |
%33 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg8, %arg9, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>{%arg8, %arg9} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%34 = tensor.empty(%arg8, %arg9) : tensor<?x8x4x4x?x4x2x16xi32> | |
%35 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%33 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%34 : tensor<?x8x4x4x?x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x8x4x4x?x4x2x16xi32> | |
flow.dispatch.tensor.store %35, %arg10, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg8, 8, 4, 4, %arg9, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x8x4x4x?x4x2x16xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>{%arg8, %arg9} | |
flow.return | |
} | |
%collapsed = tensor.collapse_shape %31 [[0, 1, 2, 3], [4, 5, 6, 7]] : tensor<?x8x4x4x?x4x2x16xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%32 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %32 : !hal.buffer_view | |
} | |
// -----// IR Dump After ConvertTensorToFlowPass (iree-dispatch-creation-convert-tensor-to-flow) //----- // | |
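// The tensor ops left outside the dispatches are lowered to flow ops:
// linalg.fill becomes flow.tensor.splat, tensor.expand_shape and
// tensor.collapse_shape become flow.tensor.reshape, and the padding
// tensor.insert_slice / final tensor.extract_slice each turn into a small
// copy dispatch (the insert writes into the splatted tensor via a readwrite
// binding). The reshape extents are recomputed with `floordiv` affine maps
// over the padded sizes; the canonicalizer below folds these back to the
// original `ceildiv` forms.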
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%13 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%11, %12} | |
%14 = flow.dispatch.workgroups(%2, %13, %0, %1, %0, %1, %11, %12) : (tensor<?x?xi8>{%0, %1}, tensor<?x?xi8>{%11, %12}, index, index, index, index, index, index) -> %13{%11, %12} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index) { | |
%54 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [%arg7, %arg8], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%arg7, %arg8} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %54, %arg4, offsets = [0, 0], sizes = [%arg7, %arg8], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%arg9, %arg10} | |
flow.return | |
} | |
%15 = affine.apply affine_map<()[s0] -> (s0 floordiv 128)>()[%11] | |
%16 = affine.apply affine_map<()[s0] -> (s0 floordiv 64)>()[%12] | |
%17 = affine.apply affine_map<()[s0] -> (s0 floordiv 128)>()[%11] | |
%18 = affine.apply affine_map<()[s0] -> (s0 floordiv 64)>()[%12] | |
%19 = flow.tensor.reshape %14 : tensor<?x?xi8>{%11, %12} -> tensor<?x8x16x?x2x4x8xi8>{%17, %18} | |
%20 = flow.dispatch.workgroups(%9, %10, %19, %15, %16, %9, %10) : (index, index, tensor<?x8x16x?x2x4x8xi8>{%15, %16}, index, index, index, index) -> tensor<?x?x8x4x16x2x8xi8>{%9, %10} = | |
(%arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>) { | |
%54 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%arg6, 8, 16, %arg7, 2, 4, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>{%arg6, %arg7} -> tensor<?x8x16x?x2x4x8xi8> | |
%55 = tensor.empty(%arg8, %arg9) : tensor<?x?x8x4x16x2x8xi8> | |
%56 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d1, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%54 : tensor<?x8x16x?x2x4x8xi8>) outs(%55 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
flow.dispatch.tensor.store %56, %arg10, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%arg8, %arg9, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>{%arg8, %arg9} | |
flow.return | |
} | |
%21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%23 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%24 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%25 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%23, %24} | |
%26 = flow.dispatch.workgroups(%5, %25, %3, %4, %3, %4, %23, %24) : (tensor<?x?xi8>{%3, %4}, tensor<?x?xi8>{%23, %24}, index, index, index, index, index, index) -> %25{%23, %24} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index) { | |
%54 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [%arg7, %arg8], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%arg7, %arg8} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %54, %arg4, offsets = [0, 0], sizes = [%arg7, %arg8], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%arg9, %arg10} | |
flow.return | |
} | |
%27 = affine.apply affine_map<()[s0] -> (s0 floordiv 64)>()[%23] | |
%28 = affine.apply affine_map<()[s0] -> (s0 floordiv 128)>()[%24] | |
%29 = affine.apply affine_map<()[s0] -> (s0 floordiv 64)>()[%23] | |
%30 = affine.apply affine_map<()[s0] -> (s0 floordiv 128)>()[%24] | |
%31 = flow.tensor.reshape %26 : tensor<?x?xi8>{%23, %24} -> tensor<?x2x4x8x?x4x2x16xi8>{%29, %30} | |
%32 = flow.dispatch.workgroups(%21, %22, %31, %27, %28, %21, %22) : (index, index, tensor<?x2x4x8x?x4x2x16xi8>{%27, %28}, index, index, index, index) -> tensor<?x?x4x2x4x16x2x8xi8>{%21, %22} = | |
(%arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>) { | |
%54 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg6, 2, 4, 8, %arg7, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>{%arg6, %arg7} -> tensor<?x2x4x8x?x4x2x16xi8> | |
%55 = tensor.empty(%arg8, %arg9) : tensor<?x?x4x2x4x16x2x8xi8> | |
%56 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d5, d6, d7, d0, d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%54 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%55 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
flow.dispatch.tensor.store %56, %arg10, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg8, %arg9, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x4x2x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>{%arg8, %arg9} | |
flow.return | |
} | |
%33 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%34 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%35 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%36 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%37 = flow.tensor.splat %c0_i32 : tensor<?x?xi32>{%35, %36} | |
%38 = flow.dispatch.workgroups(%8, %37, %6, %7, %6, %7, %35, %36) : (tensor<?x?xi32>{%6, %7}, tensor<?x?xi32>{%35, %36}, index, index, index, index, index, index) -> %37{%35, %36} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index) { | |
%54 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [%arg7, %arg8], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%arg7, %arg8} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %54, %arg4, offsets = [0, 0], sizes = [%arg7, %arg8], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%arg9, %arg10} | |
flow.return | |
} | |
%39 = affine.apply affine_map<()[s0] -> (s0 floordiv 128)>()[%35] | |
%40 = affine.apply affine_map<()[s0] -> (s0 floordiv 128)>()[%36] | |
%41 = affine.apply affine_map<()[s0] -> (s0 floordiv 128)>()[%35] | |
%42 = affine.apply affine_map<()[s0] -> (s0 floordiv 128)>()[%36] | |
%43 = flow.tensor.reshape %38 : tensor<?x?xi32>{%35, %36} -> tensor<?x8x4x4x?x4x2x16xi32>{%41, %42} | |
%44 = flow.dispatch.workgroups(%33, %34, %43, %39, %40, %33, %34) : (index, index, tensor<?x8x4x4x?x4x2x16xi32>{%39, %40}, index, index, index, index) -> tensor<?x?x8x4x2x4x16x4xi32>{%33, %34} = | |
(%arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>) { | |
%54 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg6, 8, 4, 4, %arg7, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>{%arg6, %arg7} -> tensor<?x8x4x4x?x4x2x16xi32> | |
%55 = tensor.empty(%arg8, %arg9) : tensor<?x?x8x4x2x4x16x4xi32> | |
%56 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%54 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%55 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %56, %arg10, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg8, %arg9, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>{%arg8, %arg9} | |
flow.return | |
} | |
%45 = flow.dispatch.workgroups(%20, %32, %44, %9, %10, %21, %22, %33, %34) : (tensor<?x?x8x4x16x2x8xi8>{%9, %10}, tensor<?x?x4x2x4x16x2x8xi8>{%21, %22}, tensor<?x?x8x4x2x4x16x4xi32>{%33, %34}, index, index, index, index, index, index) -> %44{%33, %34} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>, %arg5: !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index) { | |
%54 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%arg6, %arg7, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%arg6, %arg7} -> tensor<?x?x8x4x16x2x8xi8> | |
%55 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg8, %arg9, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%arg8, %arg9} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%56 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg10, %arg11, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%arg10, %arg11} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%57 = iree_gpu.multi_mma %54, %55, %56 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %57, %arg5, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg10, %arg11, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%arg10, %arg11} | |
flow.return | |
} | |
%46 = flow.dispatch.workgroups(%33, %34, %45, %33, %34, %33, %34) : (index, index, tensor<?x?x8x4x2x4x16x4xi32>{%33, %34}, index, index, index, index) -> tensor<?x8x4x4x?x4x2x16xi32>{%33, %34} = | |
(%arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>) { | |
%54 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg8, %arg9, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>{%arg8, %arg9} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%55 = tensor.empty(%arg8, %arg9) : tensor<?x8x4x4x?x4x2x16xi32> | |
%56 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%54 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%55 : tensor<?x8x4x4x?x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x8x4x4x?x4x2x16xi32> | |
flow.dispatch.tensor.store %56, %arg10, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg8, 8, 4, 4, %arg9, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x8x4x4x?x4x2x16xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>{%arg8, %arg9} | |
flow.return | |
} | |
%47 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%33] | |
%48 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%34] | |
%49 = flow.tensor.reshape %46 : tensor<?x8x4x4x?x4x2x16xi32>{%33, %34} -> tensor<?x?xi32>{%47, %48} | |
%50 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%33] | |
%51 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%34] | |
%52 = flow.dispatch.workgroups(%49, %6, %7, %50, %51, %6, %7) : (tensor<?x?xi32>{%50, %51}, index, index, index, index, index, index) -> tensor<?x?xi32>{%6, %7} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>) { | |
%54 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [%arg8, %arg9], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%arg6, %arg7} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %54, %arg10, offsets = [0, 0], sizes = [%arg8, %arg9], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%arg8, %arg9} | |
flow.return | |
} | |
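// NOTE: this final dispatch drops the padding: it loads only the leading
// %6 x %7 block of the 128-aligned result and stores it to the output.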
%53 = hal.tensor.export %52 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %53 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
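// NOTE: observable effects of canonicalization on this function, comparing with
// the dump above: affine maps are composed -- the two-step "s0 ceildiv 128"
// followed by "s0 * 128" becomes the single map "(s0 ceildiv 128) * 128"
// applied directly to the imported dims -- and duplicate dispatch operands are
// deduplicated, e.g. %46 previously captured (%33, %34) three times over and
// now takes just (%45, %33, %34).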
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%13 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%11, %12} | |
%14 = flow.dispatch.workgroups(%2, %13, %0, %1, %11, %12) : (tensor<?x?xi8>{%0, %1}, tensor<?x?xi8>{%11, %12}, index, index, index, index) -> %13{%11, %12} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%54 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [%arg5, %arg6], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%arg5, %arg6} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %54, %arg4, offsets = [0, 0], sizes = [%arg5, %arg6], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%arg7, %arg8} | |
flow.return | |
} | |
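// NOTE: %13/%14 implement zero-padding of input0 as data movement: splat an i8
// zero tensor at the aligned sizes {%11 = ceil(M/128)*128, %12 = ceil(K/64)*64},
// then copy the original %0 x %1 input into its top-left corner through a
// readwrite dispatch.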
%15 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%16 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%17 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%19 = flow.tensor.reshape %14 : tensor<?x?xi8>{%11, %12} -> tensor<?x8x16x?x2x4x8xi8>{%17, %18} | |
%20 = flow.dispatch.workgroups(%19, %15, %16, %9, %10) : (tensor<?x8x16x?x2x4x8xi8>{%15, %16}, index, index, index, index) -> tensor<?x?x8x4x16x2x8xi8>{%9, %10} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>) { | |
%54 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%arg4, 8, 16, %arg5, 2, 4, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>{%arg4, %arg5} -> tensor<?x8x16x?x2x4x8xi8> | |
%55 = tensor.empty(%arg6, %arg7) : tensor<?x?x8x4x16x2x8xi8> | |
%56 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d1, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%54 : tensor<?x8x16x?x2x4x8xi8>) outs(%55 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
flow.dispatch.tensor.store %56, %arg8, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%arg6, %arg7, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>{%arg6, %arg7} | |
flow.return | |
} | |
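// NOTE: %19/%20 look like a tensor pack decomposed into reshape + transpose:
// the flow.tensor.reshape splits the padded LHS into 8x16 tile factors along M
// and 2x4x8 along K, and the permuting linalg.generic lays the tiles out in the
// ?x?x8x4x16x2x8 order the multi_mma expects for its LHS.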
%21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%23 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%24 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%25 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%23, %24} | |
%26 = flow.dispatch.workgroups(%5, %25, %3, %4, %23, %24) : (tensor<?x?xi8>{%3, %4}, tensor<?x?xi8>{%23, %24}, index, index, index, index) -> %25{%23, %24} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%54 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [%arg5, %arg6], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%arg5, %arg6} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %54, %arg4, offsets = [0, 0], sizes = [%arg5, %arg6], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%arg7, %arg8} | |
flow.return | |
} | |
%27 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%28 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%29 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%30 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%31 = flow.tensor.reshape %26 : tensor<?x?xi8>{%23, %24} -> tensor<?x2x4x8x?x4x2x16xi8>{%29, %30} | |
%32 = flow.dispatch.workgroups(%31, %27, %28, %21, %22) : (tensor<?x2x4x8x?x4x2x16xi8>{%27, %28}, index, index, index, index) -> tensor<?x?x4x2x4x16x2x8xi8>{%21, %22} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>) { | |
%54 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg4, 2, 4, 8, %arg5, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>{%arg4, %arg5} -> tensor<?x2x4x8x?x4x2x16xi8> | |
%55 = tensor.empty(%arg6, %arg7) : tensor<?x?x4x2x4x16x2x8xi8> | |
%56 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d5, d6, d7, d0, d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%54 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%55 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
flow.dispatch.tensor.store %56, %arg8, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg6, %arg7, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x4x2x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>{%arg6, %arg7} | |
flow.return | |
} | |
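// NOTE: the same reshape + transpose packing is applied to the RHS here, with
// 2x4x8 tile factors along K and 4x2x16 along N (a 64x128 tile), and is
// repeated once more below (%43/%44) for the 128x128 i32 accumulator.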
%33 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%34 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%35 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%36 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%37 = flow.tensor.splat %c0_i32 : tensor<?x?xi32>{%35, %36} | |
%38 = flow.dispatch.workgroups(%8, %37, %6, %7, %35, %36) : (tensor<?x?xi32>{%6, %7}, tensor<?x?xi32>{%35, %36}, index, index, index, index) -> %37{%35, %36} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%54 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [%arg5, %arg6], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%arg5, %arg6} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %54, %arg4, offsets = [0, 0], sizes = [%arg5, %arg6], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%arg7, %arg8} | |
flow.return | |
} | |
%39 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%40 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%41 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%42 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%43 = flow.tensor.reshape %38 : tensor<?x?xi32>{%35, %36} -> tensor<?x8x4x4x?x4x2x16xi32>{%41, %42} | |
%44 = flow.dispatch.workgroups(%43, %39, %40, %33, %34) : (tensor<?x8x4x4x?x4x2x16xi32>{%39, %40}, index, index, index, index) -> tensor<?x?x8x4x2x4x16x4xi32>{%33, %34} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>) { | |
%54 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg4, 8, 4, 4, %arg5, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>{%arg4, %arg5} -> tensor<?x8x4x4x?x4x2x16xi32> | |
%55 = tensor.empty(%arg6, %arg7) : tensor<?x?x8x4x2x4x16x4xi32> | |
%56 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%54 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%55 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %56, %arg8, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg6, %arg7, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>{%arg6, %arg7} | |
flow.return | |
} | |
%45 = flow.dispatch.workgroups(%20, %32, %44, %9, %10, %21, %22, %33, %34) : (tensor<?x?x8x4x16x2x8xi8>{%9, %10}, tensor<?x?x4x2x4x16x2x8xi8>{%21, %22}, tensor<?x?x8x4x2x4x16x4xi32>{%33, %34}, index, index, index, index, index, index) -> %44{%33, %34} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>, %arg5: !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index) { | |
%54 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%arg6, %arg7, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%arg6, %arg7} -> tensor<?x?x8x4x16x2x8xi8> | |
%55 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg8, %arg9, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%arg8, %arg9} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%56 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg10, %arg11, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%arg10, %arg11} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%57 = iree_gpu.multi_mma %54, %55, %56 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %57, %arg5, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg10, %arg11, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%arg10, %arg11} | |
flow.return | |
} | |
%46 = flow.dispatch.workgroups(%45, %33, %34) : (tensor<?x?x8x4x2x4x16x4xi32>{%33, %34}, index, index) -> tensor<?x8x4x4x?x4x2x16xi32>{%33, %34} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>) { | |
%54 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg4, %arg5, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>{%arg4, %arg5} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%55 = tensor.empty(%arg4, %arg5) : tensor<?x8x4x4x?x4x2x16xi32> | |
%56 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%54 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%55 : tensor<?x8x4x4x?x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x8x4x4x?x4x2x16xi32> | |
flow.dispatch.tensor.store %56, %arg6, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg4, 8, 4, 4, %arg5, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x8x4x4x?x4x2x16xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>{%arg4, %arg5} | |
flow.return | |
} | |
%47 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%48 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%49 = flow.tensor.reshape %46 : tensor<?x8x4x4x?x4x2x16xi32>{%33, %34} -> tensor<?x?xi32>{%47, %48} | |
%50 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%51 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%52 = flow.dispatch.workgroups(%49, %50, %51, %6, %7) : (tensor<?x?xi32>{%50, %51}, index, index, index, index) -> tensor<?x?xi32>{%6, %7} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>) { | |
%54 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [%arg6, %arg7], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%arg4, %arg5} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %54, %arg8, offsets = [0, 0], sizes = [%arg6, %arg7], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%arg6, %arg7} | |
flow.return | |
} | |
%53 = hal.tensor.export %52 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %53 : !hal.buffer_view | |
} | |
// -----// IR Dump After MaterializeDefaultWorkgroupCountRegionPass (iree-dispatch-creation-materialize-default-workgroup-count-region) //----- // | |
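// NOTE: this pass gives every flow.dispatch.workgroups op an explicit workload
// list (the [...] before the operand list) and a trailing count region mapping
// that workload to an (x, y, z) workgroup grid; captured index operands are
// annotated inside each region with flow.dispatch.workload.ordinal so they can
// be matched back to the workload.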
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%13 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%11, %12} | |
%14 = flow.dispatch.workgroups[%0, %1, %11, %12](%2, %13, %0, %1, %11, %12) : (tensor<?x?xi8>{%0, %1}, tensor<?x?xi8>{%11, %12}, index, index, index, index) -> %13{%11, %12} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%54 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%56 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%57 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%58 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [%54, %55], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%54, %55} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %58, %arg4, offsets = [0, 0], sizes = [%54, %55], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%56, %57} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
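// NOTE: flow.dispatch.workgroup_count_from_slice appears to be a placeholder at
// this phase; the concrete grid arithmetic is expected to be filled in later,
// once codegen has picked tile sizes for this dispatch.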
%15 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%16 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%17 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%19 = flow.tensor.reshape %14 : tensor<?x?xi8>{%11, %12} -> tensor<?x8x16x?x2x4x8xi8>{%17, %18} | |
%20 = flow.dispatch.workgroups[%15, %16, %9, %10](%19, %15, %16, %9, %10) : (tensor<?x8x16x?x2x4x8xi8>{%15, %16}, index, index, index, index) -> tensor<?x?x8x4x16x2x8xi8>{%9, %10} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>) { | |
%54 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%56 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%57 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%58 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%54, 8, 16, %55, 2, 4, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>{%54, %55} -> tensor<?x8x16x?x2x4x8xi8> | |
%59 = tensor.empty(%56, %57) : tensor<?x?x8x4x16x2x8xi8> | |
%60 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d1, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%58 : tensor<?x8x16x?x2x4x8xi8>) outs(%59 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
flow.dispatch.tensor.store %60, %arg8, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%56, %57, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>{%56, %57} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%23 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%24 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%25 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%23, %24} | |
%26 = flow.dispatch.workgroups[%3, %4, %23, %24](%5, %25, %3, %4, %23, %24) : (tensor<?x?xi8>{%3, %4}, tensor<?x?xi8>{%23, %24}, index, index, index, index) -> %25{%23, %24} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%54 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%56 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%57 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%58 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [%54, %55], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%54, %55} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %58, %arg4, offsets = [0, 0], sizes = [%54, %55], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%56, %57} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%27 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%28 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%29 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%30 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%31 = flow.tensor.reshape %26 : tensor<?x?xi8>{%23, %24} -> tensor<?x2x4x8x?x4x2x16xi8>{%29, %30} | |
%32 = flow.dispatch.workgroups[%27, %28, %21, %22](%31, %27, %28, %21, %22) : (tensor<?x2x4x8x?x4x2x16xi8>{%27, %28}, index, index, index, index) -> tensor<?x?x4x2x4x16x2x8xi8>{%21, %22} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>) { | |
%54 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%56 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%57 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%58 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%54, 2, 4, 8, %55, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>{%54, %55} -> tensor<?x2x4x8x?x4x2x16xi8> | |
%59 = tensor.empty(%56, %57) : tensor<?x?x4x2x4x16x2x8xi8> | |
%60 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d5, d6, d7, d0, d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%58 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%59 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
flow.dispatch.tensor.store %60, %arg8, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%56, %57, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x4x2x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>{%56, %57} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%33 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%34 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%35 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%36 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%37 = flow.tensor.splat %c0_i32 : tensor<?x?xi32>{%35, %36} | |
%38 = flow.dispatch.workgroups[%6, %7, %35, %36](%8, %37, %6, %7, %35, %36) : (tensor<?x?xi32>{%6, %7}, tensor<?x?xi32>{%35, %36}, index, index, index, index) -> %37{%35, %36} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%54 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%56 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%57 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%58 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [%54, %55], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%54, %55} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %58, %arg4, offsets = [0, 0], sizes = [%54, %55], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%56, %57} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%39 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%40 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%41 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%42 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%43 = flow.tensor.reshape %38 : tensor<?x?xi32>{%35, %36} -> tensor<?x8x4x4x?x4x2x16xi32>{%41, %42} | |
%44 = flow.dispatch.workgroups[%39, %40, %33, %34](%43, %39, %40, %33, %34) : (tensor<?x8x4x4x?x4x2x16xi32>{%39, %40}, index, index, index, index) -> tensor<?x?x8x4x2x4x16x4xi32>{%33, %34} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>) { | |
%54 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%56 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%57 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%58 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%54, 8, 4, 4, %55, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>{%54, %55} -> tensor<?x8x4x4x?x4x2x16xi32> | |
%59 = tensor.empty(%56, %57) : tensor<?x?x8x4x2x4x16x4xi32> | |
%60 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%58 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%59 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %60, %arg8, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%56, %57, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>{%56, %57} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%45 = flow.dispatch.workgroups[%9, %10, %21, %22, %33, %34](%20, %32, %44, %9, %10, %21, %22, %33, %34) : (tensor<?x?x8x4x16x2x8xi8>{%9, %10}, tensor<?x?x4x2x4x16x2x8xi8>{%21, %22}, tensor<?x?x8x4x2x4x16x4xi32>{%33, %34}, index, index, index, index, index, index) -> %44{%33, %34} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>, %arg5: !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index) { | |
%54 = flow.dispatch.workload.ordinal %arg6, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg7, 1 : index | |
%56 = flow.dispatch.workload.ordinal %arg8, 2 : index | |
%57 = flow.dispatch.workload.ordinal %arg9, 3 : index | |
%58 = flow.dispatch.workload.ordinal %arg10, 4 : index | |
%59 = flow.dispatch.workload.ordinal %arg11, 5 : index | |
%60 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%54, %55, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%54, %55} -> tensor<?x?x8x4x16x2x8xi8> | |
%61 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%56, %57, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%56, %57} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%62 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%58, %59, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%58, %59} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%63 = iree_gpu.multi_mma %60, %61, %62 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %63, %arg5, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%58, %59, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%58, %59} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%46 = flow.dispatch.workgroups[%33, %34](%45, %33, %34) : (tensor<?x?x8x4x2x4x16x4xi32>{%33, %34}, index, index) -> tensor<?x8x4x4x?x4x2x16xi32>{%33, %34} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>) { | |
%54 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%56 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%54, %55, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>{%54, %55} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%57 = tensor.empty(%54, %55) : tensor<?x8x4x4x?x4x2x16xi32> | |
%58 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%56 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%57 : tensor<?x8x4x4x?x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x8x4x4x?x4x2x16xi32> | |
flow.dispatch.tensor.store %58, %arg6, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%54, 8, 4, 4, %55, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x8x4x4x?x4x2x16xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>{%54, %55} | |
flow.return | |
} count(%arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%47 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%48 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%49 = flow.tensor.reshape %46 : tensor<?x8x4x4x?x4x2x16xi32>{%33, %34} -> tensor<?x?xi32>{%47, %48} | |
%50 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%51 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%52 = flow.dispatch.workgroups[%50, %51, %6, %7](%49, %50, %51, %6, %7) : (tensor<?x?xi32>{%50, %51}, index, index, index, index) -> tensor<?x?xi32>{%6, %7} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>) { | |
%54 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%56 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%57 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%58 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [%56, %57], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%54, %55} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %58, %arg8, offsets = [0, 0], sizes = [%56, %57], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%56, %57} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%53 = hal.tensor.export %52 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %53 : !hal.buffer_view | |
} | |
// -----// IR Dump After VerifyInputLegalityPass (iree-verify-input-legality) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}> | |
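// NOTE: key fields of the target above: arch "gfx942", subgroup size 64, at
// most 1024 threads and 65536 bytes of workgroup memory per workgroup, and an
// mma list that includes MFMA_I32_16x16x32_I8 -- the intrinsic selected by the
// data_tiled_mma_layout on the multi_mma ops in this module; ukernels = "none",
// so no microkernel library is linked in.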
#map = affine_map<()[s0] -> (s0 ceildiv 128)> | |
#map1 = affine_map<()[s0] -> (s0 ceildiv 64)> | |
#map2 = affine_map<()[s0] -> ((s0 ceildiv 128) * 128)> | |
#map3 = affine_map<()[s0] -> ((s0 ceildiv 64) * 64)> | |
#map4 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d1, d4, d5, d6)> | |
#map5 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)> | |
#map6 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d5, d6, d7, d0, d2, d3, d4)> | |
#map7 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)> | |
#map8 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)> | |
#map9 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)> | |
#map10 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
#map11 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
#map12 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply #map()[%0] | |
%10 = affine.apply #map1()[%1] | |
%11 = affine.apply #map2()[%0] | |
%12 = affine.apply #map3()[%1] | |
%13 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%11, %12} | |
%14 = flow.dispatch.workgroups[%0, %1, %11, %12](%2, %13, %0, %1, %11, %12) : (tensor<?x?xi8>{%0, %1}, tensor<?x?xi8>{%11, %12}, index, index, index, index) -> %13{%11, %12} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%54 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%56 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%57 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%58 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [%54, %55], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%54, %55} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %58, %arg4, offsets = [0, 0], sizes = [%54, %55], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%56, %57} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%15 = affine.apply #map()[%0] | |
%16 = affine.apply #map1()[%1] | |
%17 = affine.apply #map()[%0] | |
%18 = affine.apply #map1()[%1] | |
%19 = flow.tensor.reshape %14 : tensor<?x?xi8>{%11, %12} -> tensor<?x8x16x?x2x4x8xi8>{%17, %18} | |
%20 = flow.dispatch.workgroups[%15, %16, %9, %10](%19, %15, %16, %9, %10) : (tensor<?x8x16x?x2x4x8xi8>{%15, %16}, index, index, index, index) -> tensor<?x?x8x4x16x2x8xi8>{%9, %10} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>) { | |
%54 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%56 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%57 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%58 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%54, 8, 16, %55, 2, 4, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>{%54, %55} -> tensor<?x8x16x?x2x4x8xi8> | |
%59 = tensor.empty(%56, %57) : tensor<?x?x8x4x16x2x8xi8> | |
%60 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%58 : tensor<?x8x16x?x2x4x8xi8>) outs(%59 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
flow.dispatch.tensor.store %60, %arg8, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%56, %57, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>{%56, %57} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%21 = affine.apply #map()[%4] | |
%22 = affine.apply #map1()[%3] | |
%23 = affine.apply #map3()[%3] | |
%24 = affine.apply #map2()[%4] | |
%25 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%23, %24} | |
%26 = flow.dispatch.workgroups[%3, %4, %23, %24](%5, %25, %3, %4, %23, %24) : (tensor<?x?xi8>{%3, %4}, tensor<?x?xi8>{%23, %24}, index, index, index, index) -> %25{%23, %24} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%54 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%56 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%57 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%58 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [%54, %55], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%54, %55} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %58, %arg4, offsets = [0, 0], sizes = [%54, %55], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%56, %57} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%27 = affine.apply #map1()[%3] | |
%28 = affine.apply #map()[%4] | |
%29 = affine.apply #map1()[%3] | |
%30 = affine.apply #map()[%4] | |
%31 = flow.tensor.reshape %26 : tensor<?x?xi8>{%23, %24} -> tensor<?x2x4x8x?x4x2x16xi8>{%29, %30} | |
%32 = flow.dispatch.workgroups[%27, %28, %21, %22](%31, %27, %28, %21, %22) : (tensor<?x2x4x8x?x4x2x16xi8>{%27, %28}, index, index, index, index) -> tensor<?x?x4x2x4x16x2x8xi8>{%21, %22} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>) { | |
%54 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%56 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%57 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%58 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%54, 2, 4, 8, %55, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>{%54, %55} -> tensor<?x2x4x8x?x4x2x16xi8> | |
%59 = tensor.empty(%56, %57) : tensor<?x?x4x2x4x16x2x8xi8> | |
%60 = linalg.generic {indexing_maps = [#map6, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%58 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%59 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
flow.dispatch.tensor.store %60, %arg8, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%56, %57, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x4x2x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>{%56, %57} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%33 = affine.apply #map()[%6] | |
%34 = affine.apply #map()[%7] | |
%35 = affine.apply #map2()[%6] | |
%36 = affine.apply #map2()[%7] | |
%37 = flow.tensor.splat %c0_i32 : tensor<?x?xi32>{%35, %36} | |
%38 = flow.dispatch.workgroups[%6, %7, %35, %36](%8, %37, %6, %7, %35, %36) : (tensor<?x?xi32>{%6, %7}, tensor<?x?xi32>{%35, %36}, index, index, index, index) -> %37{%35, %36} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%54 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%56 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%57 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%58 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [%54, %55], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%54, %55} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %58, %arg4, offsets = [0, 0], sizes = [%54, %55], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%56, %57} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%39 = affine.apply #map()[%6] | |
%40 = affine.apply #map()[%7] | |
%41 = affine.apply #map()[%6] | |
%42 = affine.apply #map()[%7] | |
%43 = flow.tensor.reshape %38 : tensor<?x?xi32>{%35, %36} -> tensor<?x8x4x4x?x4x2x16xi32>{%41, %42} | |
%44 = flow.dispatch.workgroups[%39, %40, %33, %34](%43, %39, %40, %33, %34) : (tensor<?x8x4x4x?x4x2x16xi32>{%39, %40}, index, index, index, index) -> tensor<?x?x8x4x2x4x16x4xi32>{%33, %34} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>) { | |
%54 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%56 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%57 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%58 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%54, 8, 4, 4, %55, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>{%54, %55} -> tensor<?x8x4x4x?x4x2x16xi32> | |
%59 = tensor.empty(%56, %57) : tensor<?x?x8x4x2x4x16x4xi32> | |
%60 = linalg.generic {indexing_maps = [#map8, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%58 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%59 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %60, %arg8, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%56, %57, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>{%56, %57} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%45 = flow.dispatch.workgroups[%9, %10, %21, %22, %33, %34](%20, %32, %44, %9, %10, %21, %22, %33, %34) : (tensor<?x?x8x4x16x2x8xi8>{%9, %10}, tensor<?x?x4x2x4x16x2x8xi8>{%21, %22}, tensor<?x?x8x4x2x4x16x4xi32>{%33, %34}, index, index, index, index, index, index) -> %44{%33, %34} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>, %arg5: !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index) { | |
%54 = flow.dispatch.workload.ordinal %arg6, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg7, 1 : index | |
%56 = flow.dispatch.workload.ordinal %arg8, 2 : index | |
%57 = flow.dispatch.workload.ordinal %arg9, 3 : index | |
%58 = flow.dispatch.workload.ordinal %arg10, 4 : index | |
%59 = flow.dispatch.workload.ordinal %arg11, 5 : index | |
%60 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%54, %55, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%54, %55} -> tensor<?x?x8x4x16x2x8xi8> | |
%61 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%56, %57, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%56, %57} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%62 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%58, %59, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%58, %59} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%63 = iree_gpu.multi_mma %60, %61, %62 {indexing_maps = [#map10, #map11, #map12], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %63, %arg5, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%58, %59, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%58, %59} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%46 = flow.dispatch.workgroups[%33, %34](%45, %33, %34) : (tensor<?x?x8x4x2x4x16x4xi32>{%33, %34}, index, index) -> tensor<?x8x4x4x?x4x2x16xi32>{%33, %34} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>) { | |
%54 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%56 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%54, %55, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>{%54, %55} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%57 = tensor.empty(%54, %55) : tensor<?x8x4x4x?x4x2x16xi32> | |
%58 = linalg.generic {indexing_maps = [#map9, #map8], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%56 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%57 : tensor<?x8x4x4x?x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x8x4x4x?x4x2x16xi32> | |
flow.dispatch.tensor.store %58, %arg6, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%54, 8, 4, 4, %55, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x8x4x4x?x4x2x16xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>{%54, %55} | |
flow.return | |
} count(%arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%47 = affine.apply #map2()[%6] | |
%48 = affine.apply #map2()[%7] | |
%49 = flow.tensor.reshape %46 : tensor<?x8x4x4x?x4x2x16xi32>{%33, %34} -> tensor<?x?xi32>{%47, %48} | |
%50 = affine.apply #map2()[%6] | |
%51 = affine.apply #map2()[%7] | |
%52 = flow.dispatch.workgroups[%50, %51, %6, %7](%49, %50, %51, %6, %7) : (tensor<?x?xi32>{%50, %51}, index, index, index, index) -> tensor<?x?xi32>{%6, %7} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>) { | |
%54 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%56 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%57 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%58 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [%56, %57], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%54, %55} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %58, %arg8, offsets = [0, 0], sizes = [%56, %57], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%56, %57} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%53 = hal.tensor.export %52 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %53 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After CaptureDynamicDimsPass (iree-flow-capture-dynamic-dims) //----- // | |
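// NOTE: from here on, each dispatch region opens with flow.dispatch.tie_shape
// ops that bind every dynamically shaped !flow.dispatch.tensor argument to the
// index arguments carrying its runtime dims; the loads and stores below now go
// through the tied values (%54, %55) instead of the raw block arguments.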
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%13 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%11, %12} | |
%14 = flow.dispatch.workgroups[%0, %1, %11, %12](%2, %13, %0, %1, %11, %12) : (tensor<?x?xi8>{%0, %1}, tensor<?x?xi8>{%11, %12}, index, index, index, index) -> %13{%11, %12} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%54 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%arg5, %arg6} | |
%55 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%arg7, %arg8} | |
%56 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%57 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%58 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%59 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%60 = flow.dispatch.tensor.load %54, offsets = [0, 0], sizes = [%56, %57], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%56, %57} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %60, %55, offsets = [0, 0], sizes = [%56, %57], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%58, %59} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%15 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%16 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%17 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%19 = flow.tensor.reshape %14 : tensor<?x?xi8>{%11, %12} -> tensor<?x8x16x?x2x4x8xi8>{%17, %18} | |
%20 = flow.dispatch.workgroups[%15, %16, %9, %10](%19, %15, %16, %9, %10, %17, %18) : (tensor<?x8x16x?x2x4x8xi8>{%15, %16}, index, index, index, index, index, index) -> tensor<?x?x8x4x16x2x8xi8>{%9, %10} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>) { | |
%54 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>{%arg8, %arg9} | |
%55 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>{%arg6, %arg7} | |
%56 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%57 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%58 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%59 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%60 = flow.dispatch.tensor.load %54, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%56, 8, 16, %57, 2, 4, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>{%56, %57} -> tensor<?x8x16x?x2x4x8xi8> | |
%61 = tensor.empty(%58, %59) : tensor<?x?x8x4x16x2x8xi8> | |
%62 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d1, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%60 : tensor<?x8x16x?x2x4x8xi8>) outs(%61 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
flow.dispatch.tensor.store %62, %55, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%58, %59, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>{%58, %59} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
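// Same padding step for the RHS: %23/%24 round its K and N sizes up to multiples of 64
// and 128, and the dispatch below copies input1 into the zero splat %25.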
%21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%23 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%24 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%25 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%23, %24} | |
%26 = flow.dispatch.workgroups[%3, %4, %23, %24](%5, %25, %3, %4, %23, %24) : (tensor<?x?xi8>{%3, %4}, tensor<?x?xi8>{%23, %24}, index, index, index, index) -> %25{%23, %24} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%54 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%arg5, %arg6} | |
%55 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%arg7, %arg8} | |
%56 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%57 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%58 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%59 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%60 = flow.dispatch.tensor.load %54, offsets = [0, 0], sizes = [%56, %57], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%56, %57} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %60, %55, offsets = [0, 0], sizes = [%56, %57], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%58, %59} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
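// Pack of the padded RHS: %31 splits it into 2x4x8 (K) by 4x2x16 (N) tiles, and the
// dispatch below transposes them into the ?x?x4x2x4x16x2x8 layout. Note the swapped
// outer dims: the packed RHS is indexed N-major as {%21, %22}.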
%27 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%28 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%29 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%30 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%31 = flow.tensor.reshape %26 : tensor<?x?xi8>{%23, %24} -> tensor<?x2x4x8x?x4x2x16xi8>{%29, %30} | |
%32 = flow.dispatch.workgroups[%27, %28, %21, %22](%31, %27, %28, %21, %22, %29, %30) : (tensor<?x2x4x8x?x4x2x16xi8>{%27, %28}, index, index, index, index, index, index) -> tensor<?x?x4x2x4x16x2x8xi8>{%21, %22} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>) { | |
%54 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>{%arg8, %arg9} | |
%55 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>{%arg6, %arg7} | |
%56 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%57 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%58 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%59 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%60 = flow.dispatch.tensor.load %54, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%56, 2, 4, 8, %57, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>{%56, %57} -> tensor<?x2x4x8x?x4x2x16xi8> | |
%61 = tensor.empty(%58, %59) : tensor<?x?x4x2x4x16x2x8xi8> | |
%62 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d5, d6, d7, d0, d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%60 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%61 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
flow.dispatch.tensor.store %62, %55, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%58, %59, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x4x2x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>{%58, %59} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
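// The i32 accumulator gets the same treatment: both of its dimensions are rounded up to
// multiples of 128 (%35/%36), and input2 is copied into the zero splat %37.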
%33 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%34 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%35 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%36 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%37 = flow.tensor.splat %c0_i32 : tensor<?x?xi32>{%35, %36} | |
%38 = flow.dispatch.workgroups[%6, %7, %35, %36](%8, %37, %6, %7, %35, %36) : (tensor<?x?xi32>{%6, %7}, tensor<?x?xi32>{%35, %36}, index, index, index, index) -> %37{%35, %36} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%54 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%arg5, %arg6} | |
%55 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%arg7, %arg8} | |
%56 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%57 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%58 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%59 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%60 = flow.dispatch.tensor.load %54, offsets = [0, 0], sizes = [%56, %57], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%56, %57} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %60, %55, offsets = [0, 0], sizes = [%56, %57], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%58, %59} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
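// Accumulator pack: 8x4x4 (M) by 4x2x16 (N) tiles, transposed into the
// ?x?x8x4x2x4x16x4xi32 layout that multi_mma accumulates into in place.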
%39 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%40 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%41 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%42 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%43 = flow.tensor.reshape %38 : tensor<?x?xi32>{%35, %36} -> tensor<?x8x4x4x?x4x2x16xi32>{%41, %42} | |
%44 = flow.dispatch.workgroups[%39, %40, %33, %34](%43, %39, %40, %33, %34, %41, %42) : (tensor<?x8x4x4x?x4x2x16xi32>{%39, %40}, index, index, index, index, index, index) -> tensor<?x?x8x4x2x4x16x4xi32>{%33, %34} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>) { | |
%54 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>{%arg8, %arg9} | |
%55 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>{%arg6, %arg7} | |
%56 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%57 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%58 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%59 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%60 = flow.dispatch.tensor.load %54, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%56, 8, 4, 4, %57, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>{%56, %57} -> tensor<?x8x4x4x?x4x2x16xi32> | |
%61 = tensor.empty(%58, %59) : tensor<?x?x8x4x2x4x16x4xi32> | |
%62 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%60 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%61 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %62, %55, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%58, %59, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>{%58, %59} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
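// The actual matmul: iree_gpu.multi_mma below runs the MFMA_I32_16x16x32_I8 intrinsic
// over the outer tile dims (parallel M, parallel N, reduction K) with unroll_m = 8,
// unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2, reading and writing the packed
// accumulator through a readwrite binding.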
%45 = flow.dispatch.workgroups[%9, %10, %21, %22, %33, %34](%20, %32, %44, %9, %10, %21, %22, %33, %34) : (tensor<?x?x8x4x16x2x8xi8>{%9, %10}, tensor<?x?x4x2x4x16x2x8xi8>{%21, %22}, tensor<?x?x8x4x2x4x16x4xi32>{%33, %34}, index, index, index, index, index, index) -> %44{%33, %34} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>, %arg5: !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index) { | |
%54 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%arg6, %arg7} | |
%55 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%arg8, %arg9} | |
%56 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%arg10, %arg11} | |
%57 = flow.dispatch.workload.ordinal %arg6, 0 : index | |
%58 = flow.dispatch.workload.ordinal %arg7, 1 : index | |
%59 = flow.dispatch.workload.ordinal %arg8, 2 : index | |
%60 = flow.dispatch.workload.ordinal %arg9, 3 : index | |
%61 = flow.dispatch.workload.ordinal %arg10, 4 : index | |
%62 = flow.dispatch.workload.ordinal %arg11, 5 : index | |
%63 = flow.dispatch.tensor.load %54, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%57, %58, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%57, %58} -> tensor<?x?x8x4x16x2x8xi8> | |
%64 = flow.dispatch.tensor.load %55, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%59, %60, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%59, %60} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%65 = flow.dispatch.tensor.load %56, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%61, %62, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%61, %62} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%66 = iree_gpu.multi_mma %63, %64, %65 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %66, %56, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%61, %62, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%61, %62} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8 | |
flow.return %x, %y, %z : index, index, index | |
} | |
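// Unpack: the inverse transpose of the accumulator pack, restoring the
// ?x8x4x4x?x4x2x16 tile order before the tensor is reshaped back to 2-D.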
%46 = flow.dispatch.workgroups[%33, %34](%45, %33, %34) : (tensor<?x?x8x4x2x4x16x4xi32>{%33, %34}, index, index) -> tensor<?x8x4x4x?x4x2x16xi32>{%33, %34} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>) { | |
%54 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>{%arg4, %arg5} | |
%55 = flow.dispatch.tie_shape %arg6 : !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>{%arg4, %arg5} | |
%56 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%57 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%58 = flow.dispatch.tensor.load %54, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%56, %57, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>{%56, %57} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%59 = tensor.empty(%56, %57) : tensor<?x8x4x4x?x4x2x16xi32> | |
%60 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%58 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%59 : tensor<?x8x4x4x?x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x8x4x4x?x4x2x16xi32> | |
flow.dispatch.tensor.store %60, %55, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%56, 8, 4, 4, %57, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x8x4x4x?x4x2x16xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>{%56, %57} | |
flow.return | |
} count(%arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4 | |
flow.return %x, %y, %z : index, index, index | |
} | |
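// Finally, %49 flattens the tiles back to the padded ?x? shape and the dispatch below
// extracts the top-left %6 x %7 slice, discarding the zero padding.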
%47 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%48 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%49 = flow.tensor.reshape %46 : tensor<?x8x4x4x?x4x2x16xi32>{%33, %34} -> tensor<?x?xi32>{%47, %48} | |
%50 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%51 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%52 = flow.dispatch.workgroups[%50, %51, %6, %7](%49, %50, %51, %6, %7, %47, %48) : (tensor<?x?xi32>{%50, %51}, index, index, index, index, index, index) -> tensor<?x?xi32>{%6, %7} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>) { | |
%54 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%arg8, %arg9} | |
%55 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%arg6, %arg7} | |
%56 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%57 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%58 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%59 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%60 = flow.dispatch.tensor.load %54, offsets = [0, 0], sizes = [%58, %59], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%56, %57} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %60, %55, offsets = [0, 0], sizes = [%58, %59], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%58, %59} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%53 = hal.tensor.export %52 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %53 : !hal.buffer_view | |
} | |
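// The canonicalizer run below leaves the dispatch structure intact; it mainly hoists
// flow.dispatch.workload.ordinal ops ahead of the tie_shape ops that use them and drops
// now-dead captured index operands (compare the operand list of the final copy dispatch).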
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%13 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%11, %12} | |
%14 = flow.dispatch.workgroups[%0, %1, %11, %12](%2, %13, %0, %1, %11, %12) : (tensor<?x?xi8>{%0, %1}, tensor<?x?xi8>{%11, %12}, index, index, index, index) -> %13{%11, %12} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%54 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%56 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%57 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%58 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%54, %55} | |
%59 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%56, %57} | |
%60 = flow.dispatch.tensor.load %58, offsets = [0, 0], sizes = [%54, %55], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%54, %55} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %60, %59, offsets = [0, 0], sizes = [%54, %55], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%56, %57} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%15 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%16 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%17 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%19 = flow.tensor.reshape %14 : tensor<?x?xi8>{%11, %12} -> tensor<?x8x16x?x2x4x8xi8>{%17, %18} | |
%20 = flow.dispatch.workgroups[%15, %16, %9, %10](%19, %15, %16, %9, %10, %17, %18) : (tensor<?x8x16x?x2x4x8xi8>{%15, %16}, index, index, index, index, index, index) -> tensor<?x?x8x4x16x2x8xi8>{%9, %10} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>) { | |
%54 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%55 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%56 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>{%arg8, %arg9} | |
%57 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>{%54, %55} | |
%58 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%59 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%60 = flow.dispatch.tensor.load %56, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%58, 8, 16, %59, 2, 4, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>{%arg8, %arg9} -> tensor<?x8x16x?x2x4x8xi8> | |
%61 = tensor.empty(%54, %55) : tensor<?x?x8x4x16x2x8xi8> | |
%62 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d1, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%60 : tensor<?x8x16x?x2x4x8xi8>) outs(%61 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
flow.dispatch.tensor.store %62, %57, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%54, %55, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>{%54, %55} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%23 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%24 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%25 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%23, %24} | |
%26 = flow.dispatch.workgroups[%3, %4, %23, %24](%5, %25, %3, %4, %23, %24) : (tensor<?x?xi8>{%3, %4}, tensor<?x?xi8>{%23, %24}, index, index, index, index) -> %25{%23, %24} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%54 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%56 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%57 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%58 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%54, %55} | |
%59 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%56, %57} | |
%60 = flow.dispatch.tensor.load %58, offsets = [0, 0], sizes = [%54, %55], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%54, %55} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %60, %59, offsets = [0, 0], sizes = [%54, %55], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%56, %57} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%27 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%28 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%29 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%30 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%31 = flow.tensor.reshape %26 : tensor<?x?xi8>{%23, %24} -> tensor<?x2x4x8x?x4x2x16xi8>{%29, %30} | |
%32 = flow.dispatch.workgroups[%27, %28, %21, %22](%31, %27, %28, %21, %22, %29, %30) : (tensor<?x2x4x8x?x4x2x16xi8>{%27, %28}, index, index, index, index, index, index) -> tensor<?x?x4x2x4x16x2x8xi8>{%21, %22} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>) { | |
%54 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%55 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%56 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>{%arg8, %arg9} | |
%57 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>{%54, %55} | |
%58 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%59 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%60 = flow.dispatch.tensor.load %56, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%58, 2, 4, 8, %59, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>{%arg8, %arg9} -> tensor<?x2x4x8x?x4x2x16xi8> | |
%61 = tensor.empty(%54, %55) : tensor<?x?x4x2x4x16x2x8xi8> | |
%62 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d5, d6, d7, d0, d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%60 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%61 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
flow.dispatch.tensor.store %62, %57, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%54, %55, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x4x2x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>{%54, %55} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%33 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%34 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%35 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%36 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%37 = flow.tensor.splat %c0_i32 : tensor<?x?xi32>{%35, %36} | |
%38 = flow.dispatch.workgroups[%6, %7, %35, %36](%8, %37, %6, %7, %35, %36) : (tensor<?x?xi32>{%6, %7}, tensor<?x?xi32>{%35, %36}, index, index, index, index) -> %37{%35, %36} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%54 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%56 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%57 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%58 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%54, %55} | |
%59 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%56, %57} | |
%60 = flow.dispatch.tensor.load %58, offsets = [0, 0], sizes = [%54, %55], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%54, %55} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %60, %59, offsets = [0, 0], sizes = [%54, %55], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%56, %57} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%39 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%40 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%41 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%42 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%43 = flow.tensor.reshape %38 : tensor<?x?xi32>{%35, %36} -> tensor<?x8x4x4x?x4x2x16xi32>{%41, %42} | |
%44 = flow.dispatch.workgroups[%39, %40, %33, %34](%43, %39, %40, %33, %34, %41, %42) : (tensor<?x8x4x4x?x4x2x16xi32>{%39, %40}, index, index, index, index, index, index) -> tensor<?x?x8x4x2x4x16x4xi32>{%33, %34} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>) { | |
%54 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%55 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%56 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>{%arg8, %arg9} | |
%57 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>{%54, %55} | |
%58 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%59 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%60 = flow.dispatch.tensor.load %56, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%58, 8, 4, 4, %59, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>{%arg8, %arg9} -> tensor<?x8x4x4x?x4x2x16xi32> | |
%61 = tensor.empty(%54, %55) : tensor<?x?x8x4x2x4x16x4xi32> | |
%62 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%60 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%61 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %62, %57, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%54, %55, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>{%54, %55} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%45 = flow.dispatch.workgroups[%9, %10, %21, %22, %33, %34](%20, %32, %44, %9, %10, %21, %22, %33, %34) : (tensor<?x?x8x4x16x2x8xi8>{%9, %10}, tensor<?x?x4x2x4x16x2x8xi8>{%21, %22}, tensor<?x?x8x4x2x4x16x4xi32>{%33, %34}, index, index, index, index, index, index) -> %44{%33, %34} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>, %arg5: !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index) { | |
%54 = flow.dispatch.workload.ordinal %arg6, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg7, 1 : index | |
%56 = flow.dispatch.workload.ordinal %arg8, 2 : index | |
%57 = flow.dispatch.workload.ordinal %arg9, 3 : index | |
%58 = flow.dispatch.workload.ordinal %arg10, 4 : index | |
%59 = flow.dispatch.workload.ordinal %arg11, 5 : index | |
%60 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%54, %55} | |
%61 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%56, %57} | |
%62 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%58, %59} | |
%63 = flow.dispatch.tensor.load %60, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%54, %55, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%54, %55} -> tensor<?x?x8x4x16x2x8xi8> | |
%64 = flow.dispatch.tensor.load %61, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%56, %57, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%56, %57} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%65 = flow.dispatch.tensor.load %62, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%58, %59, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%58, %59} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%66 = iree_gpu.multi_mma %63, %64, %65 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %66, %62, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%58, %59, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%58, %59} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%46 = flow.dispatch.workgroups[%33, %34](%45, %33, %34) : (tensor<?x?x8x4x2x4x16x4xi32>{%33, %34}, index, index) -> tensor<?x8x4x4x?x4x2x16xi32>{%33, %34} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>) { | |
%54 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%56 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>{%54, %55} | |
%57 = flow.dispatch.tie_shape %arg6 : !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>{%54, %55} | |
%58 = flow.dispatch.tensor.load %56, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%54, %55, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>{%54, %55} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%59 = tensor.empty(%54, %55) : tensor<?x8x4x4x?x4x2x16xi32> | |
%60 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%58 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%59 : tensor<?x8x4x4x?x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x8x4x4x?x4x2x16xi32> | |
flow.dispatch.tensor.store %60, %57, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%54, 8, 4, 4, %55, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x8x4x4x?x4x2x16xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>{%54, %55} | |
flow.return | |
} count(%arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%47 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%48 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%49 = flow.tensor.reshape %46 : tensor<?x8x4x4x?x4x2x16xi32>{%33, %34} -> tensor<?x?xi32>{%47, %48} | |
%50 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%51 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%52 = flow.dispatch.workgroups[%50, %51, %6, %7](%49, %6, %7, %47, %48) : (tensor<?x?xi32>{%50, %51}, index, index, index, index) -> tensor<?x?xi32>{%6, %7} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>) { | |
%54 = flow.dispatch.workload.ordinal %arg4, 2 : index | |
%55 = flow.dispatch.workload.ordinal %arg5, 3 : index | |
%56 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%arg6, %arg7} | |
%57 = flow.dispatch.tie_shape %arg8 : !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%54, %55} | |
%58 = flow.dispatch.tensor.load %56, offsets = [0, 0], sizes = [%54, %55], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%arg6, %arg7} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %58, %57, offsets = [0, 0], sizes = [%54, %55], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%54, %55} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%53 = hal.tensor.export %52 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %53 : !hal.buffer_view | |
} | |
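// CSE below folds the duplicated affine.apply chains, so each padded size and tile
// count is computed once (e.g. the reshape of %14 now reuses %9/%10 directly) and the
// local value numbering inside the dispatches compacts accordingly.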
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%13 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%11, %12} | |
%14 = flow.dispatch.workgroups[%0, %1, %11, %12](%2, %13, %0, %1, %11, %12) : (tensor<?x?xi8>{%0, %1}, tensor<?x?xi8>{%11, %12}, index, index, index, index) -> %13{%11, %12} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%38 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%39 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%40 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%41 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%42 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%38, %39} | |
%43 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%40, %41} | |
%44 = flow.dispatch.tensor.load %42, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%38, %39} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %44, %43, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%40, %41} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%15 = flow.tensor.reshape %14 : tensor<?x?xi8>{%11, %12} -> tensor<?x8x16x?x2x4x8xi8>{%9, %10} | |
%16 = flow.dispatch.workgroups[%9, %10, %9, %10](%15, %9, %10, %9, %10, %9, %10) : (tensor<?x8x16x?x2x4x8xi8>{%9, %10}, index, index, index, index, index, index) -> tensor<?x?x8x4x16x2x8xi8>{%9, %10} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>) { | |
%38 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%39 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%40 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>{%arg8, %arg9} | |
%41 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>{%38, %39} | |
%42 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%43 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%44 = flow.dispatch.tensor.load %40, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%42, 8, 16, %43, 2, 4, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>{%arg8, %arg9} -> tensor<?x8x16x?x2x4x8xi8> | |
%45 = tensor.empty(%38, %39) : tensor<?x?x8x4x16x2x8xi8> | |
%46 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d1, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%44 : tensor<?x8x16x?x2x4x8xi8>) outs(%45 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
flow.dispatch.tensor.store %46, %41, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%38, %39, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>{%38, %39} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%17 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%19 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%20 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%21 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%19, %20} | |
%22 = flow.dispatch.workgroups[%3, %4, %19, %20](%5, %21, %3, %4, %19, %20) : (tensor<?x?xi8>{%3, %4}, tensor<?x?xi8>{%19, %20}, index, index, index, index) -> %21{%19, %20} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%38 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%39 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%40 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%41 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%42 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%38, %39} | |
%43 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%40, %41} | |
%44 = flow.dispatch.tensor.load %42, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%38, %39} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %44, %43, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%40, %41} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%23 = flow.tensor.reshape %22 : tensor<?x?xi8>{%19, %20} -> tensor<?x2x4x8x?x4x2x16xi8>{%18, %17} | |
%24 = flow.dispatch.workgroups[%18, %17, %17, %18](%23, %18, %17, %17, %18, %18, %17) : (tensor<?x2x4x8x?x4x2x16xi8>{%18, %17}, index, index, index, index, index, index) -> tensor<?x?x4x2x4x16x2x8xi8>{%17, %18} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>) { | |
%38 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%39 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%40 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>{%arg8, %arg9} | |
%41 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>{%38, %39} | |
%42 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%43 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%44 = flow.dispatch.tensor.load %40, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%42, 2, 4, 8, %43, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>{%arg8, %arg9} -> tensor<?x2x4x8x?x4x2x16xi8> | |
%45 = tensor.empty(%38, %39) : tensor<?x?x4x2x4x16x2x8xi8> | |
%46 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d5, d6, d7, d0, d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%44 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%45 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
flow.dispatch.tensor.store %46, %41, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%38, %39, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x4x2x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>{%38, %39} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%25 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%26 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%27 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%28 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%29 = flow.tensor.splat %c0_i32 : tensor<?x?xi32>{%27, %28} | |
%30 = flow.dispatch.workgroups[%6, %7, %27, %28](%8, %29, %6, %7, %27, %28) : (tensor<?x?xi32>{%6, %7}, tensor<?x?xi32>{%27, %28}, index, index, index, index) -> %29{%27, %28} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%38 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%39 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%40 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%41 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%42 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%38, %39} | |
%43 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%40, %41} | |
%44 = flow.dispatch.tensor.load %42, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%38, %39} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %44, %43, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%40, %41} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%31 = flow.tensor.reshape %30 : tensor<?x?xi32>{%27, %28} -> tensor<?x8x4x4x?x4x2x16xi32>{%25, %26} | |
%32 = flow.dispatch.workgroups[%25, %26, %25, %26](%31, %25, %26, %25, %26, %25, %26) : (tensor<?x8x4x4x?x4x2x16xi32>{%25, %26}, index, index, index, index, index, index) -> tensor<?x?x8x4x2x4x16x4xi32>{%25, %26} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>) { | |
%38 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%39 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%40 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>{%arg8, %arg9} | |
%41 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>{%38, %39} | |
%42 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%43 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%44 = flow.dispatch.tensor.load %40, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%42, 8, 4, 4, %43, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>{%arg8, %arg9} -> tensor<?x8x4x4x?x4x2x16xi32> | |
%45 = tensor.empty(%38, %39) : tensor<?x?x8x4x2x4x16x4xi32> | |
%46 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%44 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%45 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %46, %41, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%38, %39, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>{%38, %39} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%33 = flow.dispatch.workgroups[%9, %10, %17, %18, %25, %26](%16, %24, %32, %9, %10, %17, %18, %25, %26) : (tensor<?x?x8x4x16x2x8xi8>{%9, %10}, tensor<?x?x4x2x4x16x2x8xi8>{%17, %18}, tensor<?x?x8x4x2x4x16x4xi32>{%25, %26}, index, index, index, index, index, index) -> %32{%25, %26} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>, %arg5: !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index) { | |
%38 = flow.dispatch.workload.ordinal %arg6, 0 : index | |
%39 = flow.dispatch.workload.ordinal %arg7, 1 : index | |
%40 = flow.dispatch.workload.ordinal %arg8, 2 : index | |
%41 = flow.dispatch.workload.ordinal %arg9, 3 : index | |
%42 = flow.dispatch.workload.ordinal %arg10, 4 : index | |
%43 = flow.dispatch.workload.ordinal %arg11, 5 : index | |
%44 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%38, %39} | |
%45 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%40, %41} | |
%46 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%42, %43} | |
%47 = flow.dispatch.tensor.load %44, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%38, %39, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%38, %39} -> tensor<?x?x8x4x16x2x8xi8> | |
%48 = flow.dispatch.tensor.load %45, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%40, %41, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%40, %41} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%49 = flow.dispatch.tensor.load %46, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%42, %43, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%42, %43} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%50 = iree_gpu.multi_mma %47, %48, %49 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %50, %46, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%42, %43, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%42, %43} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%34 = flow.dispatch.workgroups[%25, %26](%33, %25, %26) : (tensor<?x?x8x4x2x4x16x4xi32>{%25, %26}, index, index) -> tensor<?x8x4x4x?x4x2x16xi32>{%25, %26} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>) { | |
%38 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%39 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%40 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>{%38, %39} | |
%41 = flow.dispatch.tie_shape %arg6 : !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>{%38, %39} | |
%42 = flow.dispatch.tensor.load %40, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%38, %39, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>{%38, %39} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%43 = tensor.empty(%38, %39) : tensor<?x8x4x4x?x4x2x16xi32> | |
%44 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%42 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%43 : tensor<?x8x4x4x?x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x8x4x4x?x4x2x16xi32> | |
flow.dispatch.tensor.store %44, %41, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%38, 8, 4, 4, %39, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x8x4x4x?x4x2x16xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>{%38, %39} | |
flow.return | |
} count(%arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%35 = flow.tensor.reshape %34 : tensor<?x8x4x4x?x4x2x16xi32>{%25, %26} -> tensor<?x?xi32>{%27, %28} | |
%36 = flow.dispatch.workgroups[%27, %28, %6, %7](%35, %6, %7, %27, %28) : (tensor<?x?xi32>{%27, %28}, index, index, index, index) -> tensor<?x?xi32>{%6, %7} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>) { | |
%38 = flow.dispatch.workload.ordinal %arg4, 2 : index | |
%39 = flow.dispatch.workload.ordinal %arg5, 3 : index | |
%40 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%arg6, %arg7} | |
%41 = flow.dispatch.tie_shape %arg8 : !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%38, %39} | |
%42 = flow.dispatch.tensor.load %40, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%arg6, %arg7} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %42, %41, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%38, %39} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%37 = hal.tensor.export %36 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %37 : !hal.buffer_view | |
} | |
// -----// IR Dump After InitializeEmptyTensorsPass (iree-flow-initialize-empty-tensors) //----- // | |
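// NOTE: iree-flow-initialize-empty-tensors gives defined contents to any
// tensor.empty values that escape a dispatch region; every tensor.empty in
// this module is already local to a dispatch, so the function below appears
// unchanged from the previous dump.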
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%13 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%11, %12} | |
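    // Pad the LHS up to the data-tiling granularity: %11 = ceil(M/128)*128 rows,
    // %12 = ceil(K/64)*64 columns. The tile sizes follow from the
    // MFMA_I32_16x16x32_I8 layout used below: M0 = unroll_m * 16 = 128 and
    // K0 = unroll_k * 32 = 64. %13 is a zero splat; the dispatch below copies
    // the original %0 x %1 payload into its top-left corner, realizing the
    // padding without a tensor.pad op.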
%14 = flow.dispatch.workgroups[%0, %1, %11, %12](%2, %13, %0, %1, %11, %12) : (tensor<?x?xi8>{%0, %1}, tensor<?x?xi8>{%11, %12}, index, index, index, index) -> %13{%11, %12} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%38 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%39 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%40 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%41 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%42 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%38, %39} | |
%43 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%40, %41} | |
%44 = flow.dispatch.tensor.load %42, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%38, %39} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %44, %43, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%40, %41} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%15 = flow.tensor.reshape %14 : tensor<?x?xi8>{%11, %12} -> tensor<?x8x16x?x2x4x8xi8>{%9, %10} | |
%16 = flow.dispatch.workgroups[%9, %10, %9, %10](%15, %9, %10, %9, %10, %9, %10) : (tensor<?x8x16x?x2x4x8xi8>{%9, %10}, index, index, index, index, index, index) -> tensor<?x?x8x4x16x2x8xi8>{%9, %10} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>) { | |
%38 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%39 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%40 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>{%arg8, %arg9} | |
%41 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>{%38, %39} | |
%42 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%43 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%44 = flow.dispatch.tensor.load %40, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%42, 8, 16, %43, 2, 4, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>{%arg8, %arg9} -> tensor<?x8x16x?x2x4x8xi8> | |
%45 = tensor.empty(%38, %39) : tensor<?x?x8x4x16x2x8xi8> | |
%46 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d1, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%44 : tensor<?x8x16x?x2x4x8xi8>) outs(%45 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
flow.dispatch.tensor.store %46, %41, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%38, %39, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>{%38, %39} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
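    // %15/%16 pack the padded LHS: the reshape splits rows into 128-wide M
    // tiles (8x16) and columns into 64-wide K tiles (2x4x8), and the
    // linalg.generic transposes the inner dims into the ?x?x8x4x16x2x8 layout
    // that the data-tiled multi_mma below expects for its LHS operand.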
%17 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%19 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%20 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%21 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%19, %20} | |
%22 = flow.dispatch.workgroups[%3, %4, %19, %20](%5, %21, %3, %4, %19, %20) : (tensor<?x?xi8>{%3, %4}, tensor<?x?xi8>{%19, %20}, index, index, index, index) -> %21{%19, %20} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%38 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%39 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%40 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%41 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%42 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%38, %39} | |
%43 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%40, %41} | |
%44 = flow.dispatch.tensor.load %42, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%38, %39} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %44, %43, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%40, %41} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%23 = flow.tensor.reshape %22 : tensor<?x?xi8>{%19, %20} -> tensor<?x2x4x8x?x4x2x16xi8>{%18, %17} | |
%24 = flow.dispatch.workgroups[%18, %17, %17, %18](%23, %18, %17, %17, %18, %18, %17) : (tensor<?x2x4x8x?x4x2x16xi8>{%18, %17}, index, index, index, index, index, index) -> tensor<?x?x4x2x4x16x2x8xi8>{%17, %18} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>) { | |
%38 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%39 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%40 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>{%arg8, %arg9} | |
%41 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>{%38, %39} | |
%42 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%43 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%44 = flow.dispatch.tensor.load %40, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%42, 2, 4, 8, %43, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>{%arg8, %arg9} -> tensor<?x2x4x8x?x4x2x16xi8> | |
%45 = tensor.empty(%38, %39) : tensor<?x?x4x2x4x16x2x8xi8> | |
%46 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d5, d6, d7, d0, d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%44 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%45 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
flow.dispatch.tensor.store %46, %41, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%38, %39, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x4x2x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>{%38, %39} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
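    // Same treatment for the RHS (stored K x N): pad to %19 = ceil(K/64)*64 by
    // %20 = ceil(N/128)*128 (N0 = unroll_n_to_subgroups * unroll_n * 16 = 128),
    // then reshape into 64x128 K/N tiles and swizzle into ?x?x4x2x4x16x2x8.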
%25 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%26 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%27 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%28 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%29 = flow.tensor.splat %c0_i32 : tensor<?x?xi32>{%27, %28} | |
%30 = flow.dispatch.workgroups[%6, %7, %27, %28](%8, %29, %6, %7, %27, %28) : (tensor<?x?xi32>{%6, %7}, tensor<?x?xi32>{%27, %28}, index, index, index, index) -> %29{%27, %28} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%38 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%39 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%40 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%41 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%42 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%38, %39} | |
%43 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%40, %41} | |
%44 = flow.dispatch.tensor.load %42, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%38, %39} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %44, %43, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%40, %41} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%31 = flow.tensor.reshape %30 : tensor<?x?xi32>{%27, %28} -> tensor<?x8x4x4x?x4x2x16xi32>{%25, %26} | |
%32 = flow.dispatch.workgroups[%25, %26, %25, %26](%31, %25, %26, %25, %26, %25, %26) : (tensor<?x8x4x4x?x4x2x16xi32>{%25, %26}, index, index, index, index, index, index) -> tensor<?x?x8x4x2x4x16x4xi32>{%25, %26} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>) { | |
%38 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%39 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%40 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>{%arg8, %arg9} | |
%41 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>{%38, %39} | |
%42 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%43 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%44 = flow.dispatch.tensor.load %40, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%42, 8, 4, 4, %43, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>{%arg8, %arg9} -> tensor<?x8x4x4x?x4x2x16xi32> | |
%45 = tensor.empty(%38, %39) : tensor<?x?x8x4x2x4x16x4xi32> | |
%46 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%44 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%45 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %46, %41, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%38, %39, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>{%38, %39} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
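    // Accumulator: pad M and N to multiples of 128 (%27, %28), reshape into
    // 128x128 tiles (8x4x4 by 4x2x16), and swizzle into the ?x?x8x4x2x4x16x4
    // i32 layout produced by the multi_mma.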
%33 = flow.dispatch.workgroups[%9, %10, %17, %18, %25, %26](%16, %24, %32, %9, %10, %17, %18, %25, %26) : (tensor<?x?x8x4x16x2x8xi8>{%9, %10}, tensor<?x?x4x2x4x16x2x8xi8>{%17, %18}, tensor<?x?x8x4x2x4x16x4xi32>{%25, %26}, index, index, index, index, index, index) -> %32{%25, %26} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>, %arg5: !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index) { | |
%38 = flow.dispatch.workload.ordinal %arg6, 0 : index | |
%39 = flow.dispatch.workload.ordinal %arg7, 1 : index | |
%40 = flow.dispatch.workload.ordinal %arg8, 2 : index | |
%41 = flow.dispatch.workload.ordinal %arg9, 3 : index | |
%42 = flow.dispatch.workload.ordinal %arg10, 4 : index | |
%43 = flow.dispatch.workload.ordinal %arg11, 5 : index | |
%44 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%38, %39} | |
%45 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%40, %41} | |
%46 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%42, %43} | |
%47 = flow.dispatch.tensor.load %44, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%38, %39, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%38, %39} -> tensor<?x?x8x4x16x2x8xi8> | |
%48 = flow.dispatch.tensor.load %45, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%40, %41, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%40, %41} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%49 = flow.dispatch.tensor.load %46, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%42, %43, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%42, %43} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%50 = iree_gpu.multi_mma %47, %48, %49 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %50, %46, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%42, %43, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%42, %43} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8 | |
flow.return %x, %y, %z : index, index, index | |
} | |
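    // %33 is the matmul proper: iree_gpu.multi_mma contracts the outer tile
    // dims as an ordinary matmul ((d0, d2) x (d1, d2) -> (d0, d1)) while each
    // inner 128x128x64 tile is computed with the MFMA_I32_16x16x32_I8
    // intrinsic, unrolled 8x in M, 2x in K, and 2x in N across 4 subgroups.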
%34 = flow.dispatch.workgroups[%25, %26](%33, %25, %26) : (tensor<?x?x8x4x2x4x16x4xi32>{%25, %26}, index, index) -> tensor<?x8x4x4x?x4x2x16xi32>{%25, %26} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>) { | |
%38 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%39 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%40 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>{%38, %39} | |
%41 = flow.dispatch.tie_shape %arg6 : !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>{%38, %39} | |
%42 = flow.dispatch.tensor.load %40, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%38, %39, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>{%38, %39} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%43 = tensor.empty(%38, %39) : tensor<?x8x4x4x?x4x2x16xi32> | |
%44 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%42 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%43 : tensor<?x8x4x4x?x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x8x4x4x?x4x2x16xi32> | |
flow.dispatch.tensor.store %44, %41, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%38, 8, 4, 4, %39, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x8x4x4x?x4x2x16xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>{%38, %39} | |
flow.return | |
} count(%arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4 | |
flow.return %x, %y, %z : index, index, index | |
} | |
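    // Epilogue: %34 is the inverse of the accumulator swizzle (%32's indexing
    // maps swapped), the reshape below flattens the tiles back into the padded
    // %27 x %28 matrix, and the final dispatch extracts the original %6 x %7
    // result.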
%35 = flow.tensor.reshape %34 : tensor<?x8x4x4x?x4x2x16xi32>{%25, %26} -> tensor<?x?xi32>{%27, %28} | |
%36 = flow.dispatch.workgroups[%27, %28, %6, %7](%35, %6, %7, %27, %28) : (tensor<?x?xi32>{%27, %28}, index, index, index, index) -> tensor<?x?xi32>{%6, %7} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>) { | |
%38 = flow.dispatch.workload.ordinal %arg4, 2 : index | |
%39 = flow.dispatch.workload.ordinal %arg5, 3 : index | |
%40 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%arg6, %arg7} | |
%41 = flow.dispatch.tie_shape %arg8 : !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%38, %39} | |
%42 = flow.dispatch.tensor.load %40, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%arg6, %arg7} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %42, %41, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%38, %39} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%37 = hal.tensor.export %36 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %37 : !hal.buffer_view | |
} | |
// -----// IR Dump After OutlineDispatchExternsPass (iree-flow-outline-dispatch-externs) //----- // | |
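// NOTE: iree-flow-outline-dispatch-externs outlines hand-authored
// flow.dispatch.extern ops into hal.executables; this module has none, so the
// dispatch logic is unchanged. The dump now shows the enclosing module with
// its attribute aliases (#map*, #executable_target_rocm_hsaco_fb,
// #device_target_hip).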
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}> | |
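// Target summary: gfx942 (CDNA3 / MI300 class), 64-lane subgroups, 64 KiB of
// workgroup memory, and an MFMA list that includes the MFMA_I32_16x16x32_I8
// intrinsic the data-tiled matmul relies on; #device_target_hip below binds
// this executable target to the HIP driver (with legacy_sync semantics).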
#map = affine_map<()[s0] -> (s0 ceildiv 128)> | |
#map1 = affine_map<()[s0] -> (s0 ceildiv 64)> | |
#map2 = affine_map<()[s0] -> ((s0 ceildiv 128) * 128)> | |
#map3 = affine_map<()[s0] -> ((s0 ceildiv 64) * 64)> | |
#map4 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d1, d4, d5, d6)> | |
#map5 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)> | |
#map6 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d5, d6, d7, d0, d2, d3, d4)> | |
#map7 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)> | |
#map8 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)> | |
#map9 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)> | |
#map10 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
#map11 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
#map12 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply #map()[%0] | |
%10 = affine.apply #map1()[%1] | |
%11 = affine.apply #map2()[%0] | |
%12 = affine.apply #map3()[%1] | |
%13 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%11, %12} | |
%14 = flow.dispatch.workgroups[%0, %1, %11, %12](%2, %13, %0, %1, %11, %12) : (tensor<?x?xi8>{%0, %1}, tensor<?x?xi8>{%11, %12}, index, index, index, index) -> %13{%11, %12} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%38 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%39 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%40 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%41 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%42 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%38, %39} | |
%43 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%40, %41} | |
%44 = flow.dispatch.tensor.load %42, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%38, %39} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %44, %43, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%40, %41} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%15 = flow.tensor.reshape %14 : tensor<?x?xi8>{%11, %12} -> tensor<?x8x16x?x2x4x8xi8>{%9, %10} | |
%16 = flow.dispatch.workgroups[%9, %10, %9, %10](%15, %9, %10, %9, %10, %9, %10) : (tensor<?x8x16x?x2x4x8xi8>{%9, %10}, index, index, index, index, index, index) -> tensor<?x?x8x4x16x2x8xi8>{%9, %10} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>) { | |
%38 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%39 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%40 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>{%arg8, %arg9} | |
%41 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>{%38, %39} | |
%42 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%43 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%44 = flow.dispatch.tensor.load %40, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%42, 8, 16, %43, 2, 4, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>{%arg8, %arg9} -> tensor<?x8x16x?x2x4x8xi8> | |
%45 = tensor.empty(%38, %39) : tensor<?x?x8x4x16x2x8xi8> | |
%46 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%44 : tensor<?x8x16x?x2x4x8xi8>) outs(%45 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
flow.dispatch.tensor.store %46, %41, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%38, %39, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>{%38, %39} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%17 = affine.apply #map()[%4] | |
%18 = affine.apply #map1()[%3] | |
%19 = affine.apply #map3()[%3] | |
%20 = affine.apply #map2()[%4] | |
%21 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%19, %20} | |
%22 = flow.dispatch.workgroups[%3, %4, %19, %20](%5, %21, %3, %4, %19, %20) : (tensor<?x?xi8>{%3, %4}, tensor<?x?xi8>{%19, %20}, index, index, index, index) -> %21{%19, %20} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%38 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%39 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%40 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%41 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%42 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%38, %39} | |
%43 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%40, %41} | |
%44 = flow.dispatch.tensor.load %42, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%38, %39} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %44, %43, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%40, %41} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%23 = flow.tensor.reshape %22 : tensor<?x?xi8>{%19, %20} -> tensor<?x2x4x8x?x4x2x16xi8>{%18, %17} | |
%24 = flow.dispatch.workgroups[%18, %17, %17, %18](%23, %18, %17, %17, %18, %18, %17) : (tensor<?x2x4x8x?x4x2x16xi8>{%18, %17}, index, index, index, index, index, index) -> tensor<?x?x4x2x4x16x2x8xi8>{%17, %18} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>) { | |
%38 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%39 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%40 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>{%arg8, %arg9} | |
%41 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>{%38, %39} | |
%42 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%43 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%44 = flow.dispatch.tensor.load %40, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%42, 2, 4, 8, %43, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>{%arg8, %arg9} -> tensor<?x2x4x8x?x4x2x16xi8> | |
%45 = tensor.empty(%38, %39) : tensor<?x?x4x2x4x16x2x8xi8> | |
%46 = linalg.generic {indexing_maps = [#map6, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%44 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%45 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
flow.dispatch.tensor.store %46, %41, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%38, %39, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x4x2x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>{%38, %39} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%25 = affine.apply #map()[%6] | |
%26 = affine.apply #map()[%7] | |
%27 = affine.apply #map2()[%6] | |
%28 = affine.apply #map2()[%7] | |
%29 = flow.tensor.splat %c0_i32 : tensor<?x?xi32>{%27, %28} | |
%30 = flow.dispatch.workgroups[%6, %7, %27, %28](%8, %29, %6, %7, %27, %28) : (tensor<?x?xi32>{%6, %7}, tensor<?x?xi32>{%27, %28}, index, index, index, index) -> %29{%27, %28} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%38 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%39 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%40 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%41 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%42 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%38, %39} | |
%43 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%40, %41} | |
%44 = flow.dispatch.tensor.load %42, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%38, %39} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %44, %43, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%40, %41} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%31 = flow.tensor.reshape %30 : tensor<?x?xi32>{%27, %28} -> tensor<?x8x4x4x?x4x2x16xi32>{%25, %26} | |
%32 = flow.dispatch.workgroups[%25, %26, %25, %26](%31, %25, %26, %25, %26, %25, %26) : (tensor<?x8x4x4x?x4x2x16xi32>{%25, %26}, index, index, index, index, index, index) -> tensor<?x?x8x4x2x4x16x4xi32>{%25, %26} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>) { | |
%38 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%39 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%40 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>{%arg8, %arg9} | |
%41 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>{%38, %39} | |
%42 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%43 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%44 = flow.dispatch.tensor.load %40, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%42, 8, 4, 4, %43, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>{%arg8, %arg9} -> tensor<?x8x4x4x?x4x2x16xi32> | |
%45 = tensor.empty(%38, %39) : tensor<?x?x8x4x2x4x16x4xi32> | |
%46 = linalg.generic {indexing_maps = [#map8, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%44 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%45 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %46, %41, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%38, %39, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>{%38, %39} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%33 = flow.dispatch.workgroups[%9, %10, %17, %18, %25, %26](%16, %24, %32, %9, %10, %17, %18, %25, %26) : (tensor<?x?x8x4x16x2x8xi8>{%9, %10}, tensor<?x?x4x2x4x16x2x8xi8>{%17, %18}, tensor<?x?x8x4x2x4x16x4xi32>{%25, %26}, index, index, index, index, index, index) -> %32{%25, %26} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>, %arg5: !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index) { | |
%38 = flow.dispatch.workload.ordinal %arg6, 0 : index | |
%39 = flow.dispatch.workload.ordinal %arg7, 1 : index | |
%40 = flow.dispatch.workload.ordinal %arg8, 2 : index | |
%41 = flow.dispatch.workload.ordinal %arg9, 3 : index | |
%42 = flow.dispatch.workload.ordinal %arg10, 4 : index | |
%43 = flow.dispatch.workload.ordinal %arg11, 5 : index | |
%44 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%38, %39} | |
%45 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%40, %41} | |
%46 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%42, %43} | |
%47 = flow.dispatch.tensor.load %44, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%38, %39, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%38, %39} -> tensor<?x?x8x4x16x2x8xi8> | |
%48 = flow.dispatch.tensor.load %45, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%40, %41, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%40, %41} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%49 = flow.dispatch.tensor.load %46, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%42, %43, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%42, %43} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%50 = iree_gpu.multi_mma %47, %48, %49 {indexing_maps = [#map10, #map11, #map12], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %50, %46, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%42, %43, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%42, %43} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%34 = flow.dispatch.workgroups[%25, %26](%33, %25, %26) : (tensor<?x?x8x4x2x4x16x4xi32>{%25, %26}, index, index) -> tensor<?x8x4x4x?x4x2x16xi32>{%25, %26} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>) { | |
%38 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%39 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%40 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>{%38, %39} | |
%41 = flow.dispatch.tie_shape %arg6 : !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>{%38, %39} | |
%42 = flow.dispatch.tensor.load %40, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%38, %39, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>{%38, %39} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%43 = tensor.empty(%38, %39) : tensor<?x8x4x4x?x4x2x16xi32> | |
%44 = linalg.generic {indexing_maps = [#map9, #map8], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%42 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%43 : tensor<?x8x4x4x?x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x8x4x4x?x4x2x16xi32> | |
flow.dispatch.tensor.store %44, %41, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%38, 8, 4, 4, %39, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x8x4x4x?x4x2x16xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>{%38, %39} | |
flow.return | |
} count(%arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%35 = flow.tensor.reshape %34 : tensor<?x8x4x4x?x4x2x16xi32>{%25, %26} -> tensor<?x?xi32>{%27, %28} | |
%36 = flow.dispatch.workgroups[%27, %28, %6, %7](%35, %6, %7, %27, %28) : (tensor<?x?xi32>{%27, %28}, index, index, index, index) -> tensor<?x?xi32>{%6, %7} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>) { | |
%38 = flow.dispatch.workload.ordinal %arg4, 2 : index | |
%39 = flow.dispatch.workload.ordinal %arg5, 3 : index | |
%40 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%arg6, %arg7} | |
%41 = flow.dispatch.tie_shape %arg8 : !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%38, %39} | |
%42 = flow.dispatch.tensor.load %40, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%arg6, %arg7} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %42, %41, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%38, %39} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%37 = hal.tensor.export %36 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %37 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After OutlineDispatchRegionsPass (iree-flow-outline-dispatch-regions) //----- // | |
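// NOTE: iree-flow-outline-dispatch-regions moves each inline
// flow.dispatch.workgroups body into a private flow.executable (an export
// carrying the workgroup-count function plus the body as a func.func) and
// rewrites the call sites into flow.dispatch ops. The nine executables below
// map one-to-one onto the dispatches above.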
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d1, d4, d5, d6)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d5, d6, d7, d0, d2, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)> | |
#map4 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)> | |
#map5 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)> | |
#map6 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
#map7 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
#map8 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#map9 = affine_map<()[s0] -> (s0 ceildiv 128)> | |
#map10 = affine_map<()[s0] -> (s0 ceildiv 64)> | |
#map11 = affine_map<()[s0] -> ((s0 ceildiv 128) * 128)> | |
#map12 = affine_map<()[s0] -> ((s0 ceildiv 64) * 64)> | |
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
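  // foo_dispatch_0: copies the unpadded ?x?xi8 LHS into its zero-splat padded buffer.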
flow.executable private @foo_dispatch_0 { | |
flow.executable.export public @foo_dispatch_0 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg1: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg2: index, %arg3: index, %arg4: index, %arg5: index) { | |
%0 = flow.dispatch.workload.ordinal %arg2, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg3, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg4, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg5, 3 : index | |
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%0, %1} | |
%5 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%2, %3} | |
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%0, %1} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %6, %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%2, %3} | |
return | |
} | |
} | |
} | |
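  // foo_dispatch_1: packs/swizzles the padded LHS into the ?x?x8x4x16x2x8 MFMA operand layout.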
flow.executable private @foo_dispatch_1 { | |
flow.executable.export public @foo_dispatch_1 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_1(%arg0: !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>) { | |
%0 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%1 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%2 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>{%arg5, %arg6} | |
%3 = flow.dispatch.tie_shape %arg7 : !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>{%0, %1} | |
%4 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%5 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%4, 8, 16, %5, 2, 4, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>{%arg5, %arg6} -> tensor<?x8x16x?x2x4x8xi8> | |
%7 = tensor.empty(%0, %1) : tensor<?x?x8x4x16x2x8xi8> | |
%8 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<?x8x16x?x2x4x8xi8>) outs(%7 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%0, %1, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>{%0, %1} | |
return | |
} | |
} | |
} | |
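  // foo_dispatch_2: RHS pad copy (i8).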
flow.executable private @foo_dispatch_2 { | |
flow.executable.export public @foo_dispatch_2 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_2(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg1: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg2: index, %arg3: index, %arg4: index, %arg5: index) { | |
%0 = flow.dispatch.workload.ordinal %arg2, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg3, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg4, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg5, 3 : index | |
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%0, %1} | |
%5 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%2, %3} | |
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%0, %1} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %6, %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%2, %3} | |
return | |
} | |
} | |
} | |
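  // foo_dispatch_3: packs/swizzles the padded RHS into ?x?x4x2x4x16x2x8.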
flow.executable private @foo_dispatch_3 { | |
flow.executable.export public @foo_dispatch_3 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_3(%arg0: !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>) { | |
%0 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%1 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%2 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>{%arg5, %arg6} | |
%3 = flow.dispatch.tie_shape %arg7 : !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>{%0, %1} | |
%4 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%5 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%4, 2, 4, 8, %5, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>{%arg5, %arg6} -> tensor<?x2x4x8x?x4x2x16xi8> | |
%7 = tensor.empty(%0, %1) : tensor<?x?x4x2x4x16x2x8xi8> | |
%8 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%7 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%0, %1, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x4x2x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>{%0, %1} | |
return | |
} | |
} | |
} | |
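  // foo_dispatch_4: accumulator pad copy (i32).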
flow.executable private @foo_dispatch_4 { | |
flow.executable.export public @foo_dispatch_4 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_4(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg1: !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>, %arg2: index, %arg3: index, %arg4: index, %arg5: index) { | |
%0 = flow.dispatch.workload.ordinal %arg2, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg3, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg4, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg5, 3 : index | |
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%5 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%2, %3} | |
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %6, %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%2, %3} | |
return | |
} | |
} | |
} | |
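  // foo_dispatch_5: packs/swizzles the padded accumulator into ?x?x8x4x2x4x16x4.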
flow.executable private @foo_dispatch_5 { | |
flow.executable.export public @foo_dispatch_5 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_5(%arg0: !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>) { | |
%0 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%1 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%2 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>{%arg5, %arg6} | |
%3 = flow.dispatch.tie_shape %arg7 : !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>{%0, %1} | |
%4 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%5 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%4, 8, 4, 4, %5, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>{%arg5, %arg6} -> tensor<?x8x4x4x?x4x2x16xi32> | |
%7 = tensor.empty(%0, %1) : tensor<?x?x8x4x2x4x16x4xi32> | |
%8 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%7 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%0, %1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>{%0, %1} | |
return | |
} | |
} | |
} | |
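  // foo_dispatch_6: the data-tiled multi_mma matmul itself.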
flow.executable private @foo_dispatch_6 { | |
flow.executable.export public @foo_dispatch_6 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_6(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index | |
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index | |
%6 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%0, %1} | |
%7 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%2, %3} | |
%8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%4, %5} | |
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%0, %1, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%0, %1} -> tensor<?x?x8x4x16x2x8xi8> | |
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%2, %3, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%2, %3} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%4, %5, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%4, %5} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%12 = iree_gpu.multi_mma %9, %10, %11 {indexing_maps = [#map6, #map7, #map8], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%4, %5, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%4, %5} | |
return | |
} | |
} | |
} | |
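  // foo_dispatch_7: un-swizzles the result back to ?x8x4x4x?x4x2x16.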
flow.executable private @foo_dispatch_7 { | |
flow.executable.export public @foo_dispatch_7 workgroups(%arg0: index, %arg1: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_7(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>, %arg1: index, %arg2: index, %arg3: !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>) { | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>{%0, %1} | |
%3 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>{%0, %1} | |
%4 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%0, %1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>{%0, %1} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%5 = tensor.empty(%0, %1) : tensor<?x8x4x4x?x4x2x16xi32> | |
%6 = linalg.generic {indexing_maps = [#map5, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%4 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%5 : tensor<?x8x4x4x?x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x8x4x4x?x4x2x16xi32> | |
flow.dispatch.tensor.store %6, %3, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%0, 8, 4, 4, %1, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x8x4x4x?x4x2x16xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>{%0, %1} | |
return | |
} | |
} | |
} | |
flow.executable private @foo_dispatch_8 { | |
flow.executable.export public @foo_dispatch_8 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_8(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>) { | |
%0 = flow.dispatch.workload.ordinal %arg1, 2 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 3 : index | |
%2 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%arg3, %arg4} | |
%3 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%0, %1} | |
%4 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%arg3, %arg4} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %4, %3, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%0, %1} | |
return | |
} | |
} | |
} | |
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply #map9()[%0] | |
%10 = affine.apply #map10()[%1] | |
%11 = affine.apply #map11()[%0] | |
%12 = affine.apply #map12()[%1] | |
%13 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%11, %12} | |
%14 = flow.dispatch @foo_dispatch_0::@foo_dispatch_0[%0, %1, %11, %12](%2, %13, %0, %1, %11, %12) : (tensor<?x?xi8>{%0, %1}, tensor<?x?xi8>{%11, %12}, index, index, index, index) -> %13{%11, %12} | |
%15 = flow.tensor.reshape %14 : tensor<?x?xi8>{%11, %12} -> tensor<?x8x16x?x2x4x8xi8>{%9, %10} | |
%16 = flow.dispatch @foo_dispatch_1::@foo_dispatch_1[%9, %10, %9, %10](%15, %9, %10, %9, %10, %9, %10) : (tensor<?x8x16x?x2x4x8xi8>{%9, %10}, index, index, index, index, index, index) -> tensor<?x?x8x4x16x2x8xi8>{%9, %10} | |
%17 = affine.apply #map9()[%4] | |
%18 = affine.apply #map10()[%3] | |
%19 = affine.apply #map12()[%3] | |
%20 = affine.apply #map11()[%4] | |
%21 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%19, %20} | |
%22 = flow.dispatch @foo_dispatch_2::@foo_dispatch_2[%3, %4, %19, %20](%5, %21, %3, %4, %19, %20) : (tensor<?x?xi8>{%3, %4}, tensor<?x?xi8>{%19, %20}, index, index, index, index) -> %21{%19, %20} | |
%23 = flow.tensor.reshape %22 : tensor<?x?xi8>{%19, %20} -> tensor<?x2x4x8x?x4x2x16xi8>{%18, %17} | |
%24 = flow.dispatch @foo_dispatch_3::@foo_dispatch_3[%18, %17, %17, %18](%23, %18, %17, %17, %18, %18, %17) : (tensor<?x2x4x8x?x4x2x16xi8>{%18, %17}, index, index, index, index, index, index) -> tensor<?x?x4x2x4x16x2x8xi8>{%17, %18} | |
%25 = affine.apply #map9()[%6] | |
%26 = affine.apply #map9()[%7] | |
%27 = affine.apply #map11()[%6] | |
%28 = affine.apply #map11()[%7] | |
%29 = flow.tensor.splat %c0_i32 : tensor<?x?xi32>{%27, %28} | |
%30 = flow.dispatch @foo_dispatch_4::@foo_dispatch_4[%6, %7, %27, %28](%8, %29, %6, %7, %27, %28) : (tensor<?x?xi32>{%6, %7}, tensor<?x?xi32>{%27, %28}, index, index, index, index) -> %29{%27, %28} | |
%31 = flow.tensor.reshape %30 : tensor<?x?xi32>{%27, %28} -> tensor<?x8x4x4x?x4x2x16xi32>{%25, %26} | |
%32 = flow.dispatch @foo_dispatch_5::@foo_dispatch_5[%25, %26, %25, %26](%31, %25, %26, %25, %26, %25, %26) : (tensor<?x8x4x4x?x4x2x16xi32>{%25, %26}, index, index, index, index, index, index) -> tensor<?x?x8x4x2x4x16x4xi32>{%25, %26} | |
%33 = flow.dispatch @foo_dispatch_6::@foo_dispatch_6[%9, %10, %17, %18, %25, %26](%16, %24, %32, %9, %10, %17, %18, %25, %26) : (tensor<?x?x8x4x16x2x8xi8>{%9, %10}, tensor<?x?x4x2x4x16x2x8xi8>{%17, %18}, tensor<?x?x8x4x2x4x16x4xi32>{%25, %26}, index, index, index, index, index, index) -> %32{%25, %26} | |
%34 = flow.dispatch @foo_dispatch_7::@foo_dispatch_7[%25, %26](%33, %25, %26) : (tensor<?x?x8x4x2x4x16x4xi32>{%25, %26}, index, index) -> tensor<?x8x4x4x?x4x2x16xi32>{%25, %26} | |
%35 = flow.tensor.reshape %34 : tensor<?x8x4x4x?x4x2x16xi32>{%25, %26} -> tensor<?x?xi32>{%27, %28} | |
%36 = flow.dispatch @foo_dispatch_8::@foo_dispatch_8[%27, %28, %6, %7](%35, %6, %7, %27, %28) : (tensor<?x?xi32>{%27, %28}, index, index, index, index) -> tensor<?x?xi32>{%6, %7} | |
%37 = hal.tensor.export %36 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %37 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After AnnotateDispatchesPass (iree-flow-annotate-dispatches) //----- // | |
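// Note: relative to the previous dump, this pass appears to change only the dispatch entry-point
// names (and the matching flow.dispatch call sites), appending a suffix derived from each dispatch
// body, e.g. @foo_dispatch_0 -> @foo_dispatch_0_slow_memcpy and
// @foo_dispatch_1 -> @foo_dispatch_1_transpose_DxDx8x16x2x4x8_i8.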
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d1, d4, d5, d6)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d5, d6, d7, d0, d2, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)> | |
#map4 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)> | |
#map5 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)> | |
#map6 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
#map7 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
#map8 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#map9 = affine_map<()[s0] -> (s0 ceildiv 128)> | |
#map10 = affine_map<()[s0] -> (s0 ceildiv 64)> | |
#map11 = affine_map<()[s0] -> ((s0 ceildiv 128) * 128)> | |
#map12 = affine_map<()[s0] -> ((s0 ceildiv 64) * 64)> | |
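// #map9/#map10 count 128- and 64-element tiles (ceildiv), while #map11/#map12 round the same
// sizes up to the next multiple, giving the zero-padded buffer sizes. For example, s0 = 200
// yields #map9 = ceildiv(200, 128) = 2 tiles and #map11 = 2 * 128 = 256 padded elements.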
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
flow.executable private @foo_dispatch_0 { | |
flow.executable.export public @foo_dispatch_0_slow_memcpy workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_0_slow_memcpy(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg1: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg2: index, %arg3: index, %arg4: index, %arg5: index) { | |
%0 = flow.dispatch.workload.ordinal %arg2, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg3, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg4, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg5, 3 : index | |
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%0, %1} | |
%5 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%2, %3} | |
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%0, %1} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %6, %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%2, %3} | |
return | |
} | |
} | |
} | |
flow.executable private @foo_dispatch_1 { | |
flow.executable.export public @foo_dispatch_1_transpose_DxDx8x16x2x4x8_i8 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_1_transpose_DxDx8x16x2x4x8_i8(%arg0: !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>) { | |
%0 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%1 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%2 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>{%arg5, %arg6} | |
%3 = flow.dispatch.tie_shape %arg7 : !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>{%0, %1} | |
%4 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%5 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%4, 8, 16, %5, 2, 4, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>{%arg5, %arg6} -> tensor<?x8x16x?x2x4x8xi8> | |
%7 = tensor.empty(%0, %1) : tensor<?x?x8x4x16x2x8xi8> | |
%8 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<?x8x16x?x2x4x8xi8>) outs(%7 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%0, %1, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>{%0, %1} | |
return | |
} | |
} | |
} | |
flow.executable private @foo_dispatch_2 { | |
flow.executable.export public @foo_dispatch_2_slow_memcpy workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_2_slow_memcpy(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg1: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg2: index, %arg3: index, %arg4: index, %arg5: index) { | |
%0 = flow.dispatch.workload.ordinal %arg2, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg3, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg4, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg5, 3 : index | |
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%0, %1} | |
%5 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%2, %3} | |
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%0, %1} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %6, %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%2, %3} | |
return | |
} | |
} | |
} | |
flow.executable private @foo_dispatch_3 { | |
flow.executable.export public @foo_dispatch_3_transpose_DxDx4x2x16x2x4x8_i8 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_3_transpose_DxDx4x2x16x2x4x8_i8(%arg0: !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>) { | |
%0 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%1 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%2 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>{%arg5, %arg6} | |
%3 = flow.dispatch.tie_shape %arg7 : !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>{%0, %1} | |
%4 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%5 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%4, 2, 4, 8, %5, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>{%arg5, %arg6} -> tensor<?x2x4x8x?x4x2x16xi8> | |
%7 = tensor.empty(%0, %1) : tensor<?x?x4x2x4x16x2x8xi8> | |
%8 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%7 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%0, %1, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x4x2x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>{%0, %1} | |
return | |
} | |
} | |
} | |
flow.executable private @foo_dispatch_4 { | |
flow.executable.export public @foo_dispatch_4_slow_memcpy workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_4_slow_memcpy(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg1: !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>, %arg2: index, %arg3: index, %arg4: index, %arg5: index) { | |
%0 = flow.dispatch.workload.ordinal %arg2, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg3, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg4, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg5, 3 : index | |
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%5 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%2, %3} | |
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %6, %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%2, %3} | |
return | |
} | |
} | |
} | |
flow.executable private @foo_dispatch_5 { | |
flow.executable.export public @foo_dispatch_5_transpose_DxDx8x4x4x4x2x16_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_5_transpose_DxDx8x4x4x4x2x16_i32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>) { | |
%0 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%1 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%2 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>{%arg5, %arg6} | |
%3 = flow.dispatch.tie_shape %arg7 : !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>{%0, %1} | |
%4 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%5 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%4, 8, 4, 4, %5, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>{%arg5, %arg6} -> tensor<?x8x4x4x?x4x2x16xi32> | |
%7 = tensor.empty(%0, %1) : tensor<?x?x8x4x2x4x16x4xi32> | |
%8 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%7 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%0, %1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>{%0, %1} | |
return | |
} | |
} | |
} | |
flow.executable private @foo_dispatch_6 { | |
flow.executable.export public @foo_dispatch_6 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_6(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index | |
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index | |
%6 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%0, %1} | |
%7 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%2, %3} | |
%8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%4, %5} | |
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%0, %1, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%0, %1} -> tensor<?x?x8x4x16x2x8xi8> | |
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%2, %3, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%2, %3} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%4, %5, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%4, %5} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%12 = iree_gpu.multi_mma %9, %10, %11 {indexing_maps = [#map6, #map7, #map8], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%4, %5, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%4, %5} | |
return | |
} | |
} | |
} | |
flow.executable private @foo_dispatch_7 { | |
flow.executable.export public @foo_dispatch_7_transpose_DxDx8x4x4x4x2x16_i32 workgroups(%arg0: index, %arg1: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_7_transpose_DxDx8x4x4x4x2x16_i32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>, %arg1: index, %arg2: index, %arg3: !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>) { | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>{%0, %1} | |
%3 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>{%0, %1} | |
%4 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%0, %1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>{%0, %1} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%5 = tensor.empty(%0, %1) : tensor<?x8x4x4x?x4x2x16xi32> | |
%6 = linalg.generic {indexing_maps = [#map5, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%4 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%5 : tensor<?x8x4x4x?x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x8x4x4x?x4x2x16xi32> | |
flow.dispatch.tensor.store %6, %3, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%0, 8, 4, 4, %1, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x8x4x4x?x4x2x16xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>{%0, %1} | |
return | |
} | |
} | |
} | |
flow.executable private @foo_dispatch_8 { | |
flow.executable.export public @foo_dispatch_8_slow_memcpy workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_8_slow_memcpy(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>) { | |
%0 = flow.dispatch.workload.ordinal %arg1, 2 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 3 : index | |
%2 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%arg3, %arg4} | |
%3 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%0, %1} | |
%4 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%arg3, %arg4} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %4, %3, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%0, %1} | |
return | |
} | |
} | |
} | |
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply #map9()[%0] | |
%10 = affine.apply #map10()[%1] | |
%11 = affine.apply #map11()[%0] | |
%12 = affine.apply #map12()[%1] | |
%13 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%11, %12} | |
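    // Padding pattern: %13 is a zero splat at the tile-aligned size %11 x %12; the
    // @foo_dispatch_0_slow_memcpy dispatch below copies the original %0 x %1 input into it,
    // and the following flow.tensor.reshape + transpose dispatch relayout the padded data into
    // the tiled layout consumed by the multi_mma in @foo_dispatch_6. The same
    // splat/copy/reshape/transpose sequence repeats for the RHS and accumulator operands.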
%14 = flow.dispatch @foo_dispatch_0::@foo_dispatch_0_slow_memcpy[%0, %1, %11, %12](%2, %13, %0, %1, %11, %12) : (tensor<?x?xi8>{%0, %1}, tensor<?x?xi8>{%11, %12}, index, index, index, index) -> %13{%11, %12} | |
%15 = flow.tensor.reshape %14 : tensor<?x?xi8>{%11, %12} -> tensor<?x8x16x?x2x4x8xi8>{%9, %10} | |
%16 = flow.dispatch @foo_dispatch_1::@foo_dispatch_1_transpose_DxDx8x16x2x4x8_i8[%9, %10, %9, %10](%15, %9, %10, %9, %10, %9, %10) : (tensor<?x8x16x?x2x4x8xi8>{%9, %10}, index, index, index, index, index, index) -> tensor<?x?x8x4x16x2x8xi8>{%9, %10} | |
%17 = affine.apply #map9()[%4] | |
%18 = affine.apply #map10()[%3] | |
%19 = affine.apply #map12()[%3] | |
%20 = affine.apply #map11()[%4] | |
%21 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%19, %20} | |
%22 = flow.dispatch @foo_dispatch_2::@foo_dispatch_2_slow_memcpy[%3, %4, %19, %20](%5, %21, %3, %4, %19, %20) : (tensor<?x?xi8>{%3, %4}, tensor<?x?xi8>{%19, %20}, index, index, index, index) -> %21{%19, %20} | |
%23 = flow.tensor.reshape %22 : tensor<?x?xi8>{%19, %20} -> tensor<?x2x4x8x?x4x2x16xi8>{%18, %17} | |
%24 = flow.dispatch @foo_dispatch_3::@foo_dispatch_3_transpose_DxDx4x2x16x2x4x8_i8[%18, %17, %17, %18](%23, %18, %17, %17, %18, %18, %17) : (tensor<?x2x4x8x?x4x2x16xi8>{%18, %17}, index, index, index, index, index, index) -> tensor<?x?x4x2x4x16x2x8xi8>{%17, %18} | |
%25 = affine.apply #map9()[%6] | |
%26 = affine.apply #map9()[%7] | |
%27 = affine.apply #map11()[%6] | |
%28 = affine.apply #map11()[%7] | |
%29 = flow.tensor.splat %c0_i32 : tensor<?x?xi32>{%27, %28} | |
%30 = flow.dispatch @foo_dispatch_4::@foo_dispatch_4_slow_memcpy[%6, %7, %27, %28](%8, %29, %6, %7, %27, %28) : (tensor<?x?xi32>{%6, %7}, tensor<?x?xi32>{%27, %28}, index, index, index, index) -> %29{%27, %28} | |
%31 = flow.tensor.reshape %30 : tensor<?x?xi32>{%27, %28} -> tensor<?x8x4x4x?x4x2x16xi32>{%25, %26} | |
%32 = flow.dispatch @foo_dispatch_5::@foo_dispatch_5_transpose_DxDx8x4x4x4x2x16_i32[%25, %26, %25, %26](%31, %25, %26, %25, %26, %25, %26) : (tensor<?x8x4x4x?x4x2x16xi32>{%25, %26}, index, index, index, index, index, index) -> tensor<?x?x8x4x2x4x16x4xi32>{%25, %26} | |
%33 = flow.dispatch @foo_dispatch_6::@foo_dispatch_6[%9, %10, %17, %18, %25, %26](%16, %24, %32, %9, %10, %17, %18, %25, %26) : (tensor<?x?x8x4x16x2x8xi8>{%9, %10}, tensor<?x?x4x2x4x16x2x8xi8>{%17, %18}, tensor<?x?x8x4x2x4x16x4xi32>{%25, %26}, index, index, index, index, index, index) -> %32{%25, %26} | |
%34 = flow.dispatch @foo_dispatch_7::@foo_dispatch_7_transpose_DxDx8x4x4x4x2x16_i32[%25, %26](%33, %25, %26) : (tensor<?x?x8x4x2x4x16x4xi32>{%25, %26}, index, index) -> tensor<?x8x4x4x?x4x2x16xi32>{%25, %26} | |
%35 = flow.tensor.reshape %34 : tensor<?x8x4x4x?x4x2x16xi32>{%25, %26} -> tensor<?x?xi32>{%27, %28} | |
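    // Unpadding: after the inverse transpose and reshape, @foo_dispatch_8_slow_memcpy extracts
    // the original %6 x %7 result from the tile-aligned %27 x %28 padded tensor.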
%36 = flow.dispatch @foo_dispatch_8::@foo_dispatch_8_slow_memcpy[%27, %28, %6, %7](%35, %6, %7, %27, %28) : (tensor<?x?xi32>{%27, %28}, index, index, index, index) -> tensor<?x?xi32>{%6, %7} | |
%37 = hal.tensor.export %36 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %37 : !hal.buffer_view | |
} | |
} |