Created October 7, 2024 05:58
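IR dumps for the @multi_result dispatch: a 64x128 by 128x256 matmul with a fused bias add, where both the raw matmul result and the biased result are stored to separate output bindings, as it is lowered through IREE's LLVMCPU CPUDoubleTilingExpert pipeline.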
// -----// IR Dump After LLVMCPUSelectLoweringStrategyPass (iree-llvmcpu-select-lowering-strategy) //----- // | |
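// The strategy pass only attaches attributes: lowering configs with 16x64 workgroup tiles and 8x32 vector-parallel tiles for the fill and the bias-add generic (#config), a matmul config (#config1) that adds a 16-wide reduction tile, and CPUDoubleTilingExpert translation info with loop peeling enabled. The op structure is still the untiled fill + matmul + generic with two stores.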
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]> | |
#config1 = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}> | |
#map = affine_map<(d0, d1) -> (d0, d1)> | |
#map1 = affine_map<(d0, d1) -> (d1)> | |
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}> | |
module { | |
func.func @multi_result() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_, translation_info = #translation} { | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%10 = linalg.matmul {lowering_config = #config1} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%11 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%10, %7 : tensor<64x256xf32>, tensor<256xf32>) outs(%8 : tensor<64x256xf32>) attrs = {lowering_config = #config} { | |
^bb0(%in: f32, %in_0: f32, %out: f32): | |
%12 = arith.addf %in, %in_0 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<64x256xf32> | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
} | |
// -----// IR Dump After TileAndDistributeToWorkgroupsUsingForallOpPass (iree-codegen-tile-and-distribute-to-workgroups-using-forall-op) //----- // | |
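// Workgroup distribution: the bias-add generic is tiled into an scf.forall over 16x64 tiles mapped to workgroup y/x, with a fill and matmul fused into the loop body to produce each 16x64 input tile. The full-size fill/matmul (%9/%10) remain outside the forall because the matmul result is also stored to its own binding (the multi-result aspect this example exercises).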
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32> | |
%12 = tensor.empty() : tensor<16x64xf32> | |
%13 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%12 : tensor<16x64xf32>) -> tensor<16x64xf32> | |
%14 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<16x128xf32>, tensor<128x64xf32>) outs(%13 : tensor<16x64xf32>) -> tensor<16x64xf32> | |
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%14, %extracted_slice_1 : tensor<16x64xf32>, tensor<64xf32>) outs(%extracted_slice_2 : tensor<16x64xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} { | |
^bb0(%in: f32, %in_3: f32, %out: f32): | |
%16 = arith.addf %in, %in_3 : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<16x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %15 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
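// Canonicalization makes no visible change; the IR below is identical to the previous dump.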
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32> | |
%12 = tensor.empty() : tensor<16x64xf32> | |
%13 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%12 : tensor<16x64xf32>) -> tensor<16x64xf32> | |
%14 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<16x128xf32>, tensor<128x64xf32>) outs(%13 : tensor<16x64xf32>) -> tensor<16x64xf32> | |
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%14, %extracted_slice_1 : tensor<16x64xf32>, tensor<64xf32>) outs(%extracted_slice_2 : tensor<16x64xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} { | |
^bb0(%in: f32, %in_3: f32, %out: f32): | |
%16 = arith.addf %in, %in_3 : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<16x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %15 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
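// CSE finds nothing to eliminate; the IR below is identical to the previous dump.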
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32> | |
%12 = tensor.empty() : tensor<16x64xf32> | |
%13 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%12 : tensor<16x64xf32>) -> tensor<16x64xf32> | |
%14 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<16x128xf32>, tensor<128x64xf32>) outs(%13 : tensor<16x64xf32>) -> tensor<16x64xf32> | |
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%14, %extracted_slice_1 : tensor<16x64xf32>, tensor<64xf32>) outs(%extracted_slice_2 : tensor<16x64xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} { | |
^bb0(%in: f32, %in_3: f32, %out: f32): | |
%16 = arith.addf %in, %in_3 : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<16x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %15 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
// -----// IR Dump After FuseTensorPadWithConsumerPass (iree-codegen-fuse-tensor-pad-with-consumer) //----- // | |
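// There are no tensor.pad ops in this dispatch, so this pass leaves the IR unchanged.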
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32> | |
%12 = tensor.empty() : tensor<16x64xf32> | |
%13 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%12 : tensor<16x64xf32>) -> tensor<16x64xf32> | |
%14 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<16x128xf32>, tensor<128x64xf32>) outs(%13 : tensor<16x64xf32>) -> tensor<16x64xf32> | |
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%14, %extracted_slice_1 : tensor<16x64xf32>, tensor<64xf32>) outs(%extracted_slice_2 : tensor<16x64xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} { | |
^bb0(%in: f32, %in_3: f32, %out: f32): | |
%16 = arith.addf %in, %in_3 : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<16x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %15 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
// -----// IR Dump After ConcretizePadResultShapePass (iree-codegen-concretize-pad-result-shape) //----- // | |
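// Likewise, no pad result shapes to concretize; the IR is unchanged.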
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32> | |
%12 = tensor.empty() : tensor<16x64xf32> | |
%13 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%12 : tensor<16x64xf32>) -> tensor<16x64xf32> | |
%14 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<16x128xf32>, tensor<128x64xf32>) outs(%13 : tensor<16x64xf32>) -> tensor<16x64xf32> | |
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%14, %extracted_slice_1 : tensor<16x64xf32>, tensor<64xf32>) outs(%extracted_slice_2 : tensor<16x64xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} { | |
^bb0(%in: f32, %in_3: f32, %out: f32): | |
%16 = arith.addf %in, %in_3 : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<16x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %15 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
// -----// IR Dump After LLVMCPUTileAndFusePass (iree-llvmcpu-tile-and-fuse) //----- // | |
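// Vector-parallel tiling: inside each 16x64 workgroup tile, nested scf.for loops step by 8 and 32, and the fill, matmul, and bias-add generic are re-tiled and fused at 8x32 granularity, matching the [8, 32] level of the lowering configs.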
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%c32 = arith.constant 32 : index | |
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32> | |
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) { | |
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) { | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32> | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32> | |
%14 = tensor.empty() : tensor<8x32xf32> | |
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%16 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_3, %extracted_slice_4 : tensor<8x128xf32>, tensor<128x32xf32>) outs(%15 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32> | |
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} { | |
^bb0(%in: f32, %in_7: f32, %out: f32): | |
%18 = arith.addf %in, %in_7 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<8x32xf32> | |
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32> | |
scf.yield %inserted_slice : tensor<16x64xf32> | |
} | |
scf.yield %13 : tensor<16x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
// -----// IR Dump After FuseTensorPadWithConsumerPass (iree-codegen-fuse-tensor-pad-with-consumer) //----- // | |
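// Still no tensor.pad ops; the IR below is identical to the previous dump.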
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%c32 = arith.constant 32 : index | |
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32> | |
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) { | |
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) { | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32> | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32> | |
%14 = tensor.empty() : tensor<8x32xf32> | |
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%16 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_3, %extracted_slice_4 : tensor<8x128xf32>, tensor<128x32xf32>) outs(%15 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32> | |
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} { | |
^bb0(%in: f32, %in_7: f32, %out: f32): | |
%18 = arith.addf %in, %in_7 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<8x32xf32> | |
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32> | |
scf.yield %inserted_slice : tensor<16x64xf32> | |
} | |
scf.yield %13 : tensor<16x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
// -----// IR Dump After ConcretizePadResultShapePass (iree-codegen-concretize-pad-result-shape) //----- // | |
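// No change; identical to the previous dump.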
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%c32 = arith.constant 32 : index | |
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32> | |
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) { | |
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) { | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32> | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32> | |
%14 = tensor.empty() : tensor<8x32xf32> | |
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%16 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_3, %extracted_slice_4 : tensor<8x128xf32>, tensor<128x32xf32>) outs(%15 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32> | |
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} { | |
^bb0(%in: f32, %in_7: f32, %out: f32): | |
%18 = arith.addf %in, %in_7 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<8x32xf32> | |
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32> | |
scf.yield %inserted_slice : tensor<16x64xf32> | |
} | |
scf.yield %13 : tensor<16x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
// -----// IR Dump After LLVMCPUTilePass (iree-llvmcpu-tile) //----- // | |
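// The remaining tiling level in the matmul config is the [0, 0, 16] reduction tile, but in the portion of the dump shown here the loop nest and ops still match the previous dump.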
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%c32 = arith.constant 32 : index | |
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32> | |
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) { | |
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) { | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32> | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32> | |
%14 = tensor.empty() : tensor<8x32xf32> | |
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%16 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_3, %extracted_slice_4 : tensor<8x128xf32>, tensor<128x32xf32>) outs(%15 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32> | |
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} { | |
^bb0(%in: f32, %in_7: f32, %out: f32): | |
%18 = arith.addf %in, %in_7 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<8x32xf32> | |
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32> | |
scf.yield %inserted_slice : tensor<16x64xf32> | |
} | |
scf.yield %13 : tensor<16x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
// -----// IR Dump After LLVMCPUTileAndFusePass (iree-llvmcpu-tile-and-fuse) //----- // | |
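// Note: this dump is identical to the preceding one; LLVMCPUTileAndFusePass made no further changes at this stage.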
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%c32 = arith.constant 32 : index | |
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32> | |
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) { | |
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) { | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32> | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32> | |
%14 = tensor.empty() : tensor<8x32xf32> | |
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%16 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_3, %extracted_slice_4 : tensor<8x128xf32>, tensor<128x32xf32>) outs(%15 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32> | |
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} { | |
^bb0(%in: f32, %in_7: f32, %out: f32): | |
%18 = arith.addf %in, %in_7 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<8x32xf32> | |
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32> | |
scf.yield %inserted_slice : tensor<16x64xf32> | |
} | |
scf.yield %13 : tensor<16x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
// -----// IR Dump After FuseTensorPadWithConsumerPass (iree-codegen-fuse-tensor-pad-with-consumer) //----- // | |
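// Note: unchanged. There are no tensor.pad ops in this dispatch, so the pad-with-consumer fusion has nothing to rewrite.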
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%c32 = arith.constant 32 : index | |
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32> | |
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) { | |
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) { | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32> | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32> | |
%14 = tensor.empty() : tensor<8x32xf32> | |
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%16 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_3, %extracted_slice_4 : tensor<8x128xf32>, tensor<128x32xf32>) outs(%15 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32> | |
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} { | |
^bb0(%in: f32, %in_7: f32, %out: f32): | |
%18 = arith.addf %in, %in_7 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<8x32xf32> | |
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32> | |
scf.yield %inserted_slice : tensor<16x64xf32> | |
} | |
scf.yield %13 : tensor<16x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
// -----// IR Dump After ConcretizePadResultShapePass (iree-codegen-concretize-pad-result-shape) //----- // | |
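// Note: unchanged for the same reason; there are no tensor.pad results whose shapes need concretizing.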
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%c32 = arith.constant 32 : index | |
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32> | |
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) { | |
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) { | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32> | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32> | |
%14 = tensor.empty() : tensor<8x32xf32> | |
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%16 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_3, %extracted_slice_4 : tensor<8x128xf32>, tensor<128x32xf32>) outs(%15 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32> | |
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} { | |
^bb0(%in: f32, %in_7: f32, %out: f32): | |
%18 = arith.addf %in, %in_7 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<8x32xf32> | |
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32> | |
scf.yield %inserted_slice : tensor<16x64xf32> | |
} | |
scf.yield %13 : tensor<16x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
// -----// IR Dump After LLVMCPUSplitReductionPass (iree-llvmcpu-split-reduction) //----- // | |
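// Note: unchanged; split-reduction did not rewrite the matmuls in this dispatch.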
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%c32 = arith.constant 32 : index | |
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32> | |
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) { | |
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) { | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32> | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32> | |
%14 = tensor.empty() : tensor<8x32xf32> | |
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%16 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_3, %extracted_slice_4 : tensor<8x128xf32>, tensor<128x32xf32>) outs(%15 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32> | |
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} { | |
^bb0(%in: f32, %in_7: f32, %out: f32): | |
%18 = arith.addf %in, %in_7 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<8x32xf32> | |
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32> | |
scf.yield %inserted_slice : tensor<16x64xf32> | |
} | |
scf.yield %13 : tensor<16x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
// -----// IR Dump After LLVMCPUTilePass (iree-llvmcpu-tile) //----- // | |
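// Note: this second run of LLVMCPUTilePass tiles the reduction (K = 128) dimension by 16, matching the
// [0, 0, 16] entry of the matmul lowering_config: the standalone matmul is now wrapped in an scf.for over K
// operating on 64x16 and 16x256 slices per iteration, and the fused 8x32 matmul inside the scf.forall gets its
// own K loop over 8x16 and 16x32 slices.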
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%10 = scf.for %arg0 = %c0 to %c128 step %c16 iter_args(%arg1 = %9) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[0, %arg0] [64, 16] [1, 1] : tensor<64x128xf32> to tensor<64x16xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[%arg0, 0] [16, 256] [1, 1] : tensor<128x256xf32> to tensor<16x256xf32> | |
%12 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x16xf32>, tensor<16x256xf32>) outs(%arg1 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
scf.yield %12 : tensor<64x256xf32> | |
} | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32> | |
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) { | |
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) { | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32> | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32> | |
%14 = tensor.empty() : tensor<8x32xf32> | |
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%16 = scf.for %arg7 = %c0 to %c128 step %c16 iter_args(%arg8 = %15) -> (tensor<8x32xf32>) { | |
%extracted_slice_7 = tensor.extract_slice %extracted_slice_3[0, %arg7] [8, 16] [1, 1] : tensor<8x128xf32> to tensor<8x16xf32> | |
%extracted_slice_8 = tensor.extract_slice %extracted_slice_4[%arg7, 0] [16, 32] [1, 1] : tensor<128x32xf32> to tensor<16x32xf32> | |
%18 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_7, %extracted_slice_8 : tensor<8x16xf32>, tensor<16x32xf32>) outs(%arg8 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
scf.yield %18 : tensor<8x32xf32> | |
} | |
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32> | |
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} { | |
^bb0(%in: f32, %in_7: f32, %out: f32): | |
%18 = arith.addf %in, %in_7 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<8x32xf32> | |
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32> | |
scf.yield %inserted_slice : tensor<16x64xf32> | |
} | |
scf.yield %13 : tensor<16x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
// -----// IR Dump After LLVMCPUTileAndFusePass (iree-llvmcpu-tile-and-fuse) //----- // | |
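// Note: identical to the preceding dump; no additional tiling or fusion happened here.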
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%10 = scf.for %arg0 = %c0 to %c128 step %c16 iter_args(%arg1 = %9) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[0, %arg0] [64, 16] [1, 1] : tensor<64x128xf32> to tensor<64x16xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[%arg0, 0] [16, 256] [1, 1] : tensor<128x256xf32> to tensor<16x256xf32> | |
%12 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x16xf32>, tensor<16x256xf32>) outs(%arg1 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
scf.yield %12 : tensor<64x256xf32> | |
} | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32> | |
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) { | |
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) { | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32> | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32> | |
%14 = tensor.empty() : tensor<8x32xf32> | |
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%16 = scf.for %arg7 = %c0 to %c128 step %c16 iter_args(%arg8 = %15) -> (tensor<8x32xf32>) { | |
%extracted_slice_7 = tensor.extract_slice %extracted_slice_3[0, %arg7] [8, 16] [1, 1] : tensor<8x128xf32> to tensor<8x16xf32> | |
%extracted_slice_8 = tensor.extract_slice %extracted_slice_4[%arg7, 0] [16, 32] [1, 1] : tensor<128x32xf32> to tensor<16x32xf32> | |
%18 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_7, %extracted_slice_8 : tensor<8x16xf32>, tensor<16x32xf32>) outs(%arg8 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
scf.yield %18 : tensor<8x32xf32> | |
} | |
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32> | |
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} { | |
^bb0(%in: f32, %in_7: f32, %out: f32): | |
%18 = arith.addf %in, %in_7 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<8x32xf32> | |
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32> | |
scf.yield %inserted_slice : tensor<16x64xf32> | |
} | |
scf.yield %13 : tensor<16x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
// -----// IR Dump After FuseTensorPadWithConsumerPass (iree-codegen-fuse-tensor-pad-with-consumer) //----- // | |
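// Note: unchanged; there are still no tensor.pad ops to fuse.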
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%10 = scf.for %arg0 = %c0 to %c128 step %c16 iter_args(%arg1 = %9) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[0, %arg0] [64, 16] [1, 1] : tensor<64x128xf32> to tensor<64x16xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[%arg0, 0] [16, 256] [1, 1] : tensor<128x256xf32> to tensor<16x256xf32> | |
%12 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x16xf32>, tensor<16x256xf32>) outs(%arg1 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
scf.yield %12 : tensor<64x256xf32> | |
} | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32> | |
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) { | |
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) { | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32> | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32> | |
%14 = tensor.empty() : tensor<8x32xf32> | |
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%16 = scf.for %arg7 = %c0 to %c128 step %c16 iter_args(%arg8 = %15) -> (tensor<8x32xf32>) { | |
%extracted_slice_7 = tensor.extract_slice %extracted_slice_3[0, %arg7] [8, 16] [1, 1] : tensor<8x128xf32> to tensor<8x16xf32> | |
%extracted_slice_8 = tensor.extract_slice %extracted_slice_4[%arg7, 0] [16, 32] [1, 1] : tensor<128x32xf32> to tensor<16x32xf32> | |
%18 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_7, %extracted_slice_8 : tensor<8x16xf32>, tensor<16x32xf32>) outs(%arg8 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
scf.yield %18 : tensor<8x32xf32> | |
} | |
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32> | |
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} { | |
^bb0(%in: f32, %in_7: f32, %out: f32): | |
%18 = arith.addf %in, %in_7 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<8x32xf32> | |
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32> | |
scf.yield %inserted_slice : tensor<16x64xf32> | |
} | |
scf.yield %13 : tensor<16x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
// -----// IR Dump After ConcretizePadResultShapePass (iree-codegen-concretize-pad-result-shape) //----- // | |
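// Note: as before, no tensor.pad ops are present, so this dump appears to match the previous one.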
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%10 = scf.for %arg0 = %c0 to %c128 step %c16 iter_args(%arg1 = %9) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[0, %arg0] [64, 16] [1, 1] : tensor<64x128xf32> to tensor<64x16xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[%arg0, 0] [16, 256] [1, 1] : tensor<128x256xf32> to tensor<16x256xf32> | |
%12 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x16xf32>, tensor<16x256xf32>) outs(%arg1 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
scf.yield %12 : tensor<64x256xf32> | |
} | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32> | |
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) { | |
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) { | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32> | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32> | |
%14 = tensor.empty() : tensor<8x32xf32> | |
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%16 = scf.for %arg7 = %c0 to %c128 step %c16 iter_args(%arg8 = %15) -> (tensor<8x32xf32>) { | |
%extracted_slice_7 = tensor.extract_slice %extracted_slice_3[0, %arg7] [8, 16] [1, 1] : tensor<8x128xf32> to tensor<8x16xf32> | |
%extracted_slice_8 = tensor.extract_slice %extracted_slice_4[%arg7, 0] [16, 32] [1, 1] : tensor<128x32xf32> to tensor<16x32xf32> | |
%18 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_7, %extracted_slice_8 : tensor<8x16xf32>, tensor<16x32xf32>) outs(%arg8 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
scf.yield %18 : tensor<8x32xf32> | |
} | |
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32> | |
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} { | |
^bb0(%in: f32, %in_7: f32, %out: f32): | |
%18 = arith.addf %in, %in_7 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<8x32xf32> | |
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32> | |
scf.yield %inserted_slice : tensor<16x64xf32> | |
} | |
scf.yield %13 : tensor<16x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
// -----// IR Dump After LLVMCPUPeelPass (iree-llvmcpu-peel) //----- // | |
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%10 = scf.for %arg0 = %c0 to %c128 step %c16 iter_args(%arg1 = %9) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[0, %arg0] [64, 16] [1, 1] : tensor<64x128xf32> to tensor<64x16xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[%arg0, 0] [16, 256] [1, 1] : tensor<128x256xf32> to tensor<16x256xf32> | |
%12 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x16xf32>, tensor<16x256xf32>) outs(%arg1 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
scf.yield %12 : tensor<64x256xf32> | |
} | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32> | |
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) { | |
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) { | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32> | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32> | |
%14 = tensor.empty() : tensor<8x32xf32> | |
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%16 = scf.for %arg7 = %c0 to %c128 step %c16 iter_args(%arg8 = %15) -> (tensor<8x32xf32>) { | |
%extracted_slice_7 = tensor.extract_slice %extracted_slice_3[0, %arg7] [8, 16] [1, 1] : tensor<8x128xf32> to tensor<8x16xf32> | |
%extracted_slice_8 = tensor.extract_slice %extracted_slice_4[%arg7, 0] [16, 32] [1, 1] : tensor<128x32xf32> to tensor<16x32xf32> | |
%18 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_7, %extracted_slice_8 : tensor<8x16xf32>, tensor<16x32xf32>) outs(%arg8 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
scf.yield %18 : tensor<8x32xf32> | |
} | |
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32> | |
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} { | |
^bb0(%in: f32, %in_7: f32, %out: f32): | |
%18 = arith.addf %in, %in_7 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<8x32xf32> | |
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32> | |
scf.yield %inserted_slice : tensor<16x64xf32> | |
} | |
scf.yield %13 : tensor<16x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
// -----// IR Dump After TensorToVectorVectorizePadPass (iree-codegen-vectorize-tensor-pad) //----- // | |
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%10 = scf.for %arg0 = %c0 to %c128 step %c16 iter_args(%arg1 = %9) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[0, %arg0] [64, 16] [1, 1] : tensor<64x128xf32> to tensor<64x16xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[%arg0, 0] [16, 256] [1, 1] : tensor<128x256xf32> to tensor<16x256xf32> | |
%12 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x16xf32>, tensor<16x256xf32>) outs(%arg1 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
scf.yield %12 : tensor<64x256xf32> | |
} | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32> | |
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) { | |
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) { | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32> | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32> | |
%14 = tensor.empty() : tensor<8x32xf32> | |
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%16 = scf.for %arg7 = %c0 to %c128 step %c16 iter_args(%arg8 = %15) -> (tensor<8x32xf32>) { | |
%extracted_slice_7 = tensor.extract_slice %extracted_slice_3[0, %arg7] [8, 16] [1, 1] : tensor<8x128xf32> to tensor<8x16xf32> | |
%extracted_slice_8 = tensor.extract_slice %extracted_slice_4[%arg7, 0] [16, 32] [1, 1] : tensor<128x32xf32> to tensor<16x32xf32> | |
%18 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_7, %extracted_slice_8 : tensor<8x16xf32>, tensor<16x32xf32>) outs(%arg8 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
scf.yield %18 : tensor<8x32xf32> | |
} | |
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32> | |
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} { | |
^bb0(%in: f32, %in_7: f32, %out: f32): | |
%18 = arith.addf %in, %in_7 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<8x32xf32> | |
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32> | |
scf.yield %inserted_slice : tensor<16x64xf32> | |
} | |
scf.yield %13 : tensor<16x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
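The dump above is byte-for-byte identical to the ones after ConcretizePadResultShapePass and LLVMCPUPeelPass: none of these three passes had anything to rewrite, presumably because every tile size divides its loop range evenly, so there are no remainder iterations to peel and no pad ops to concretize or vectorize. A minimal sketch of that divisibility argument (Python; the loop extents and steps are copied from the scf.for/scf.forall ops above, the "no remainder means nothing to peel" reading is my assumption about why the peeling pass is a no-op):

loops = {
    "first matmul, K tiling (scf.for to %c128 step %c16)": (128, 16),
    "workgroup forall rows (to 64 step 16)": (64, 16),
    "workgroup forall cols (to 256 step 64)": (256, 64),
    "inner rows (scf.for to %c16 step %c8)": (16, 8),
    "inner cols (scf.for to %c64 step %c32)": (64, 32),
    "inner K tiling (scf.for to %c128 step %c16)": (128, 16),
}
for name, (extent, step) in loops.items():
    # A zero remainder means there is no partial tile for loop peeling to split off.
    print(f"{name}: remainder = {extent % step}")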
// -----// IR Dump After GenericVectorizationPass (iree-codegen-generic-vectorization) //----- // | |
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<8x32xf32> | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<64x256xf32> | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = vector.transfer_write %cst_0, %8[%c0, %c0] {in_bounds = [true, true]} : vector<64x256xf32>, tensor<64x256xf32> | |
%10 = scf.for %arg0 = %c0 to %c128 step %c16 iter_args(%arg1 = %9) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[0, %arg0] [64, 16] [1, 1] : tensor<64x128xf32> to tensor<64x16xf32> | |
%extracted_slice_2 = tensor.extract_slice %6[%arg0, 0] [16, 256] [1, 1] : tensor<128x256xf32> to tensor<16x256xf32> | |
%12 = vector.transfer_read %extracted_slice[%c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<64x16xf32>, vector<64x16xf32> | |
%13 = vector.transfer_read %extracted_slice_2[%c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<16x256xf32>, vector<16x256xf32> | |
%14 = vector.transfer_read %arg1[%c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<64x256xf32>, vector<64x256xf32> | |
%15 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %12, %13, %14 : vector<64x16xf32>, vector<16x256xf32> into vector<64x256xf32> | |
%16 = vector.transfer_write %15, %arg1[%c0, %c0] {in_bounds = [true, true]} : vector<64x256xf32>, tensor<64x256xf32> | |
scf.yield %16 : tensor<64x256xf32> | |
} | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32> | |
%extracted_slice_2 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32> | |
%extracted_slice_3 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32> | |
%extracted_slice_4 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_4) -> (tensor<16x64xf32>) { | |
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) { | |
%extracted_slice_5 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32> | |
%extracted_slice_6 = tensor.extract_slice %extracted_slice_2[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32> | |
%14 = tensor.empty() : tensor<8x32xf32> | |
%15 = vector.transfer_write %cst, %14[%c0, %c0] {in_bounds = [true, true]} : vector<8x32xf32>, tensor<8x32xf32> | |
%16 = scf.for %arg7 = %c0 to %c128 step %c16 iter_args(%arg8 = %15) -> (tensor<8x32xf32>) { | |
%extracted_slice_9 = tensor.extract_slice %extracted_slice_5[0, %arg7] [8, 16] [1, 1] : tensor<8x128xf32> to tensor<8x16xf32> | |
%extracted_slice_10 = tensor.extract_slice %extracted_slice_6[%arg7, 0] [16, 32] [1, 1] : tensor<128x32xf32> to tensor<16x32xf32> | |
%22 = vector.transfer_read %extracted_slice_9[%c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<8x16xf32>, vector<8x16xf32> | |
%23 = vector.transfer_read %extracted_slice_10[%c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<16x32xf32>, vector<16x32xf32> | |
%24 = vector.transfer_read %arg8[%c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<8x32xf32>, vector<8x32xf32> | |
%25 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %22, %23, %24 : vector<8x16xf32>, vector<16x32xf32> into vector<8x32xf32> | |
%26 = vector.transfer_write %25, %arg8[%c0, %c0] {in_bounds = [true, true]} : vector<8x32xf32>, tensor<8x32xf32> | |
scf.yield %26 : tensor<8x32xf32> | |
} | |
%extracted_slice_7 = tensor.extract_slice %extracted_slice_3[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32> | |
%extracted_slice_8 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32> | |
%17 = vector.transfer_read %16[%c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<8x32xf32>, vector<8x32xf32> | |
%18 = vector.transfer_read %extracted_slice_7[%c0], %cst_1 {in_bounds = [true]} : tensor<32xf32>, vector<32xf32> | |
%19 = vector.broadcast %18 : vector<32xf32> to vector<8x32xf32> | |
%20 = arith.addf %17, %19 : vector<8x32xf32> | |
%21 = vector.transfer_write %20, %extracted_slice_8[%c0, %c0] {in_bounds = [true, true]} : vector<8x32xf32>, tensor<8x32xf32> | |
%inserted_slice = tensor.insert_slice %21 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32> | |
scf.yield %inserted_slice : tensor<16x64xf32> | |
} | |
scf.yield %13 : tensor<16x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
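This is the step where the two results diverge in vector granularity: the second result keeps accumulating in 8x32 tiles inside the scf.forall, while the first matmul, which was tiled only along its reduction dimension, is vectorized with full 64x16 and 16x256 operands and a 64x256 accumulator carried through the scf.for. A rough footprint comparison (Python sketch; the only assumption beyond the shapes printed above is the 4-byte f32 element size):

# Byte footprint of the vector values created by vectorization above (f32 = 4 bytes).
def vec_bytes(shape, elem_bytes=4):
    n = 1
    for d in shape:
        n *= d
    return n * elem_bytes

paths = {
    "first matmul (reduction-only tiling)": [(64, 16), (16, 256), (64, 256)],
    "second result (8x32 inner tiles)": [(8, 16), (16, 32), (8, 32)],
}
for label, shapes in paths.items():
    print(label, {f"{m}x{n}xf32": vec_bytes((m, n)) for (m, n) in shapes})
# The first path peaks at 65536 bytes for the 64x256 accumulator;
# the second path stays at or below 1024 bytes per value.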
// -----// IR Dump After OptimizeTensorInsertExtractSlicesPass (iree-codegen-optimize-tensor-insert-extract-slices) //----- // | |
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<8x32xf32> | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<64x256xf32> | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = scf.for %arg0 = %c0 to %c128 step %c16 iter_args(%arg1 = %cst_0) -> (vector<64x256xf32>) { | |
%12 = vector.transfer_read %5[%c0, %arg0], %cst_1 {in_bounds = [true, true]} : tensor<64x128xf32>, vector<64x16xf32> | |
%13 = vector.transfer_read %6[%arg0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<128x256xf32>, vector<16x256xf32> | |
%14 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %12, %13, %arg1 : vector<64x16xf32>, vector<16x256xf32> into vector<64x256xf32> | |
scf.yield %14 : vector<64x256xf32> | |
} | |
%10 = vector.transfer_write %9, %8[%c0, %c0] {in_bounds = [true, true]} : vector<64x256xf32>, tensor<64x256xf32> | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice) -> (tensor<16x64xf32>) { | |
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) { | |
%14 = scf.for %arg7 = %c0 to %c128 step %c16 iter_args(%arg8 = %cst) -> (vector<8x32xf32>) { | |
%20 = arith.addi %arg3, %arg0 : index | |
%21 = vector.transfer_read %5[%20, %arg7], %cst_1 {in_bounds = [true, true]} : tensor<64x128xf32>, vector<8x16xf32> | |
%22 = arith.addi %arg5, %arg1 : index | |
%23 = vector.transfer_read %6[%arg7, %22], %cst_1 {in_bounds = [true, true]} : tensor<128x256xf32>, vector<16x32xf32> | |
%24 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %21, %23, %arg8 : vector<8x16xf32>, vector<16x32xf32> into vector<8x32xf32> | |
scf.yield %24 : vector<8x32xf32> | |
} | |
%15 = arith.addi %arg5, %arg1 : index | |
%16 = vector.transfer_read %7[%15], %cst_1 {in_bounds = [true]} : tensor<256xf32>, vector<32xf32> | |
%17 = vector.broadcast %16 : vector<32xf32> to vector<8x32xf32> | |
%18 = arith.addf %14, %17 : vector<8x32xf32> | |
%19 = vector.transfer_write %18, %arg6[%arg3, %arg5] {in_bounds = [true, true]} : vector<8x32xf32>, tensor<16x64xf32> | |
scf.yield %19 : tensor<16x64xf32> | |
} | |
scf.yield %13 : tensor<16x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<8x32xf32> | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<64x256xf32> | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = scf.for %arg0 = %c0 to %c128 step %c16 iter_args(%arg1 = %cst_0) -> (vector<64x256xf32>) { | |
%12 = vector.transfer_read %5[%c0, %arg0], %cst_1 {in_bounds = [true, true]} : tensor<64x128xf32>, vector<64x16xf32> | |
%13 = vector.transfer_read %6[%arg0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<128x256xf32>, vector<16x256xf32> | |
%14 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %12, %13, %arg1 : vector<64x16xf32>, vector<16x256xf32> into vector<64x256xf32> | |
scf.yield %14 : vector<64x256xf32> | |
} | |
%10 = vector.transfer_write %9, %8[%c0, %c0] {in_bounds = [true, true]} : vector<64x256xf32>, tensor<64x256xf32> | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice) -> (tensor<16x64xf32>) { | |
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) { | |
%14 = scf.for %arg7 = %c0 to %c128 step %c16 iter_args(%arg8 = %cst) -> (vector<8x32xf32>) { | |
%20 = arith.addi %arg3, %arg0 : index | |
%21 = vector.transfer_read %5[%20, %arg7], %cst_1 {in_bounds = [true, true]} : tensor<64x128xf32>, vector<8x16xf32> | |
%22 = arith.addi %arg5, %arg1 : index | |
%23 = vector.transfer_read %6[%arg7, %22], %cst_1 {in_bounds = [true, true]} : tensor<128x256xf32>, vector<16x32xf32> | |
%24 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %21, %23, %arg8 : vector<8x16xf32>, vector<16x32xf32> into vector<8x32xf32> | |
scf.yield %24 : vector<8x32xf32> | |
} | |
%15 = arith.addi %arg5, %arg1 : index | |
%16 = vector.transfer_read %7[%15], %cst_1 {in_bounds = [true]} : tensor<256xf32>, vector<32xf32> | |
%17 = vector.broadcast %16 : vector<32xf32> to vector<8x32xf32> | |
%18 = arith.addf %14, %17 : vector<8x32xf32> | |
%19 = vector.transfer_write %18, %arg6[%arg3, %arg5] {in_bounds = [true, true]} : vector<8x32xf32>, tensor<16x64xf32> | |
scf.yield %19 : tensor<16x64xf32> | |
} | |
scf.yield %13 : tensor<16x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<8x32xf32> | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<64x256xf32> | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = scf.for %arg0 = %c0 to %c128 step %c16 iter_args(%arg1 = %cst_0) -> (vector<64x256xf32>) { | |
%12 = vector.transfer_read %5[%c0, %arg0], %cst_1 {in_bounds = [true, true]} : tensor<64x128xf32>, vector<64x16xf32> | |
%13 = vector.transfer_read %6[%arg0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<128x256xf32>, vector<16x256xf32> | |
%14 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %12, %13, %arg1 : vector<64x16xf32>, vector<16x256xf32> into vector<64x256xf32> | |
scf.yield %14 : vector<64x256xf32> | |
} | |
%10 = vector.transfer_write %9, %8[%c0, %c0] {in_bounds = [true, true]} : vector<64x256xf32>, tensor<64x256xf32> | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice) -> (tensor<16x64xf32>) { | |
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) { | |
%14 = scf.for %arg7 = %c0 to %c128 step %c16 iter_args(%arg8 = %cst) -> (vector<8x32xf32>) { | |
%20 = arith.addi %arg3, %arg0 : index | |
%21 = vector.transfer_read %5[%20, %arg7], %cst_1 {in_bounds = [true, true]} : tensor<64x128xf32>, vector<8x16xf32> | |
%22 = arith.addi %arg5, %arg1 : index | |
%23 = vector.transfer_read %6[%arg7, %22], %cst_1 {in_bounds = [true, true]} : tensor<128x256xf32>, vector<16x32xf32> | |
%24 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %21, %23, %arg8 : vector<8x16xf32>, vector<16x32xf32> into vector<8x32xf32> | |
scf.yield %24 : vector<8x32xf32> | |
} | |
%15 = arith.addi %arg5, %arg1 : index | |
%16 = vector.transfer_read %7[%15], %cst_1 {in_bounds = [true]} : tensor<256xf32>, vector<32xf32> | |
%17 = vector.broadcast %16 : vector<32xf32> to vector<8x32xf32> | |
%18 = arith.addf %14, %17 : vector<8x32xf32> | |
%19 = vector.transfer_write %18, %arg6[%arg3, %arg5] {in_bounds = [true, true]} : vector<8x32xf32>, tensor<16x64xf32> | |
scf.yield %19 : tensor<16x64xf32> | |
} | |
scf.yield %13 : tensor<16x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
test.mlir:12:1: error: One or more operations with large vector sizes (32768 bytes) were found: | |
func.func @multi_result() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { | |
^ | |
<unknown>:0: note: %cst_0 = arith.constant dense<0.000000e+00> : vector<64x256xf32> | |
<unknown>:0: note: %14 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %12, %13, %arg1 : vector<64x16xf32>, vector<16x256xf32> into vector<64x256xf32> | |
test.mlir:27:9: note: scf.yield %14 : vector<64x256xf32> | |
%11 = linalg.matmul ins(%6, %7 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%10 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
^ | |
test.mlir:27:9: note: | |
%9 = scf.for %arg0 = %c0 to %c128 step %c16 iter_args(%arg1 = %cst_0) -> (vector<64x256xf32>) { | |
%12 = vector.transfer_read %5[%c0, %arg0], %cst_1 {in_bounds = [true, true]} : tensor<64x128xf32>, vector<64x16xf32> | |
%13 = vector.transfer_read %6[%arg0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<128x256xf32>, vector<16x256xf32> | |
%14 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %12, %13, %arg1 : vector<64x16xf32>, vector<16x256xf32> into vector<64x256xf32> | |
scf.yield %14 : vector<64x256xf32> | |
} | |
<unknown>:0: note: %10 = vector.transfer_write %9, %8[%c0, %c0] {in_bounds = [true, true]} : vector<64x256xf32>, tensor<64x256xf32> | |
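The diagnostic above is the direct consequence of that 64x256 accumulator: a vector<64x256xf32> occupies 64 * 256 * 4 = 65536 bytes, well past the 32768-byte figure the verifier reports (which reads as the configured upper bound rather than the size of the offending value, though that interpretation is mine). A hedged restatement of the check implied by the message (Python sketch, not IREE's actual implementation):

# Hypothetical restatement of the size check implied by the diagnostic;
# 32768 is the figure printed above and is assumed to be the allowed maximum.
LIMIT_BYTES = 32768

def f32_vector_bytes(shape):
    n = 1
    for d in shape:
        n *= d
    return 4 * n

flagged = (64, 256)  # shape of %cst_0, the vector.contract result, and the final transfer_write
size = f32_vector_bytes(flagged)
print(f"vector<64x256xf32>: {size} bytes vs limit {LIMIT_BYTES} ->",
      "illegal" if size > LIMIT_BYTES else "ok")
# 65536 > 32768, while the 8x32 values of the other result stay far under the figure;
# hence the next two dumps are marked Failed for the verifier and for
# iree-llvmcpu-lower-executable-target.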
// -----// IR Dump After LLVMCPUVerifyVectorSizeLegalityPass Failed (iree-llvmcpu-verify-vector-size-legality) //----- // | |
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<8x32xf32> | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<64x256xf32> | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = scf.for %arg0 = %c0 to %c128 step %c16 iter_args(%arg1 = %cst_0) -> (vector<64x256xf32>) { | |
%12 = vector.transfer_read %5[%c0, %arg0], %cst_1 {in_bounds = [true, true]} : tensor<64x128xf32>, vector<64x16xf32> | |
%13 = vector.transfer_read %6[%arg0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<128x256xf32>, vector<16x256xf32> | |
%14 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %12, %13, %arg1 : vector<64x16xf32>, vector<16x256xf32> into vector<64x256xf32> | |
scf.yield %14 : vector<64x256xf32> | |
} | |
%10 = vector.transfer_write %9, %8[%c0, %c0] {in_bounds = [true, true]} : vector<64x256xf32>, tensor<64x256xf32> | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32>
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice) -> (tensor<16x64xf32>) {
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) {
%14 = scf.for %arg7 = %c0 to %c128 step %c16 iter_args(%arg8 = %cst) -> (vector<8x32xf32>) {
%20 = arith.addi %arg3, %arg0 : index
%21 = vector.transfer_read %5[%20, %arg7], %cst_1 {in_bounds = [true, true]} : tensor<64x128xf32>, vector<8x16xf32>
%22 = arith.addi %arg5, %arg1 : index
%23 = vector.transfer_read %6[%arg7, %22], %cst_1 {in_bounds = [true, true]} : tensor<128x256xf32>, vector<16x32xf32>
%24 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %21, %23, %arg8 : vector<8x16xf32>, vector<16x32xf32> into vector<8x32xf32>
scf.yield %24 : vector<8x32xf32>
}
%15 = arith.addi %arg5, %arg1 : index
%16 = vector.transfer_read %7[%15], %cst_1 {in_bounds = [true]} : tensor<256xf32>, vector<32xf32>
%17 = vector.broadcast %16 : vector<32xf32> to vector<8x32xf32>
%18 = arith.addf %14, %17 : vector<8x32xf32>
%19 = vector.transfer_write %18, %arg6[%arg3, %arg5] {in_bounds = [true, true]} : vector<8x32xf32>, tensor<16x64xf32>
scf.yield %19 : tensor<16x64xf32>
}
scf.yield %13 : tensor<16x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
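// Annotation: both full 64x256 results are then written back to separate output
// bindings: %10 (matmul only) to binding 4 and %11 (matmul + bias) to binding 5.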
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
return
}
// -----// IR Dump After LLVMCPULowerExecutableTargetPass Failed (iree-llvmcpu-lower-executable-target) //----- //
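// Annotation: the function below appears to be the same @multi_result body,
// re-printed at the point where iree-llvmcpu-lower-executable-target reported
// failure; the attribute aliases (pipeline layout, executable target,
// translation info) are now expanded inline on the op.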
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} {
%cst = arith.constant dense<0.000000e+00> : vector<8x32xf32>
%cst_0 = arith.constant dense<0.000000e+00> : vector<64x256xf32>
%c128 = arith.constant 128 : index
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%cst_1 = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32>
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32>
%8 = tensor.empty() : tensor<64x256xf32>
%9 = scf.for %arg0 = %c0 to %c128 step %c16 iter_args(%arg1 = %cst_0) -> (vector<64x256xf32>) {
%12 = vector.transfer_read %5[%c0, %arg0], %cst_1 {in_bounds = [true, true]} : tensor<64x128xf32>, vector<64x16xf32>
%13 = vector.transfer_read %6[%arg0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<128x256xf32>, vector<16x256xf32>
%14 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %12, %13, %arg1 : vector<64x16xf32>, vector<16x256xf32> into vector<64x256xf32>
scf.yield %14 : vector<64x256xf32>
}
%10 = vector.transfer_write %9, %8[%c0, %c0] {in_bounds = [true, true]} : vector<64x256xf32>, tensor<64x256xf32>
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32>
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice) -> (tensor<16x64xf32>) {
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) {
%14 = scf.for %arg7 = %c0 to %c128 step %c16 iter_args(%arg8 = %cst) -> (vector<8x32xf32>) {
%20 = arith.addi %arg3, %arg0 : index
%21 = vector.transfer_read %5[%20, %arg7], %cst_1 {in_bounds = [true, true]} : tensor<64x128xf32>, vector<8x16xf32>
%22 = arith.addi %arg5, %arg1 : index
%23 = vector.transfer_read %6[%arg7, %22], %cst_1 {in_bounds = [true, true]} : tensor<128x256xf32>, vector<16x32xf32>
%24 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %21, %23, %arg8 : vector<8x16xf32>, vector<16x32xf32> into vector<8x32xf32>
scf.yield %24 : vector<8x32xf32>
}
%15 = arith.addi %arg5, %arg1 : index
%16 = vector.transfer_read %7[%15], %cst_1 {in_bounds = [true]} : tensor<256xf32>, vector<32xf32>
%17 = vector.broadcast %16 : vector<32xf32> to vector<8x32xf32>
%18 = arith.addf %14, %17 : vector<8x32xf32>
%19 = vector.transfer_write %18, %arg6[%arg3, %arg5] {in_bounds = [true, true]} : vector<8x32xf32>, tensor<16x64xf32>
scf.yield %19 : tensor<16x64xf32>
}
scf.yield %13 : tensor<16x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
return
}