
@pashu123
Created October 7, 2024 05:58
// -----// IR Dump After LLVMCPUSelectLoweringStrategyPass (iree-llvmcpu-select-lowering-strategy) //----- //
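// Note: this pass selects the CPUDoubleTilingExpert pipeline (with loop peeling enabled) and
// attaches lowering_config attributes. #config ([[16, 64], [8, 32], ...]) is used by the fill and
// the bias-add generic; #config1 is the matmul's multi-level config, which appears to carry a
// 16x64 distribution level, an 8x32 vector-parallel level, and a 16-wide vector-reduction level.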
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>
#config1 = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>
#map = affine_map<(d0, d1) -> (d0, d1)>
#map1 = affine_map<(d0, d1) -> (d1)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>
module {
func.func @multi_result() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_, translation_info = #translation} {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>>
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32>
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32>
%8 = tensor.empty() : tensor<64x256xf32>
%9 = linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32>
%10 = linalg.matmul {lowering_config = #config1} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%10, %7 : tensor<64x256xf32>, tensor<256xf32>) outs(%8 : tensor<64x256xf32>) attrs = {lowering_config = #config} {
^bb0(%in: f32, %in_0: f32, %out: f32):
%12 = arith.addf %in, %in_0 : f32
linalg.yield %12 : f32
} -> tensor<64x256xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
return
}
}
// -----// IR Dump After TileAndDistributeToWorkgroupsUsingForallOpPass (iree-codegen-tile-and-distribute-to-workgroups-using-forall-op) //----- //
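// Note: the bias-add generic and its fill/matmul producers are tiled to 16x64 workgroup tiles
// inside an scf.forall mapped to workgroup y/x. The untiled fill (%9) and matmul (%10) remain
// because the matmul result is also stored directly to binding(4); this is the multi-result case
// where one value feeds both a fused consumer and its own store.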
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32>
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32>
%8 = tensor.empty() : tensor<64x256xf32>
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32>
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32>
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) {
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32>
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32>
%12 = tensor.empty() : tensor<16x64xf32>
%13 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%12 : tensor<16x64xf32>) -> tensor<16x64xf32>
%14 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<16x128xf32>, tensor<128x64xf32>) outs(%13 : tensor<16x64xf32>) -> tensor<16x64xf32>
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32>
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%14, %extracted_slice_1 : tensor<16x64xf32>, tensor<64xf32>) outs(%extracted_slice_2 : tensor<16x64xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} {
^bb0(%in: f32, %in_3: f32, %out: f32):
%16 = arith.addf %in, %in_3 : f32
linalg.yield %16 : f32
} -> tensor<16x64xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %15 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
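// Note: canonicalization leaves this dispatch structurally unchanged from the previous dump.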
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32>
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32>
%8 = tensor.empty() : tensor<64x256xf32>
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32>
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32>
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) {
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32>
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32>
%12 = tensor.empty() : tensor<16x64xf32>
%13 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%12 : tensor<16x64xf32>) -> tensor<16x64xf32>
%14 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<16x128xf32>, tensor<128x64xf32>) outs(%13 : tensor<16x64xf32>) -> tensor<16x64xf32>
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32>
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%14, %extracted_slice_1 : tensor<16x64xf32>, tensor<64xf32>) outs(%extracted_slice_2 : tensor<16x64xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} {
^bb0(%in: f32, %in_3: f32, %out: f32):
%16 = arith.addf %in, %in_3 : f32
linalg.yield %16 : f32
} -> tensor<16x64xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %15 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
return
}
// -----// IR Dump After CSE (cse) //----- //
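// Note: CSE finds nothing to deduplicate here; the IR is identical to the canonicalized form.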
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32>
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32>
%8 = tensor.empty() : tensor<64x256xf32>
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32>
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32>
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) {
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32>
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32>
%12 = tensor.empty() : tensor<16x64xf32>
%13 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%12 : tensor<16x64xf32>) -> tensor<16x64xf32>
%14 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<16x128xf32>, tensor<128x64xf32>) outs(%13 : tensor<16x64xf32>) -> tensor<16x64xf32>
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32>
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%14, %extracted_slice_1 : tensor<16x64xf32>, tensor<64xf32>) outs(%extracted_slice_2 : tensor<16x64xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} {
^bb0(%in: f32, %in_3: f32, %out: f32):
%16 = arith.addf %in, %in_3 : f32
linalg.yield %16 : f32
} -> tensor<16x64xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %15 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
return
}
// -----// IR Dump After FuseTensorPadWithConsumerPass (iree-codegen-fuse-tensor-pad-with-consumer) //----- //
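// Note: there is no tensor.pad in this dispatch, so this pass is a no-op here.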
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32>
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32>
%8 = tensor.empty() : tensor<64x256xf32>
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32>
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32>
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) {
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32>
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32>
%12 = tensor.empty() : tensor<16x64xf32>
%13 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%12 : tensor<16x64xf32>) -> tensor<16x64xf32>
%14 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<16x128xf32>, tensor<128x64xf32>) outs(%13 : tensor<16x64xf32>) -> tensor<16x64xf32>
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32>
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%14, %extracted_slice_1 : tensor<16x64xf32>, tensor<64xf32>) outs(%extracted_slice_2 : tensor<16x64xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} {
^bb0(%in: f32, %in_3: f32, %out: f32):
%16 = arith.addf %in, %in_3 : f32
linalg.yield %16 : f32
} -> tensor<16x64xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %15 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
return
}
// -----// IR Dump After ConcretizePadResultShapePass (iree-codegen-concretize-pad-result-shape) //----- //
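// Note: likewise a no-op, since there are no pad results whose shapes need concretizing.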
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32>
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32>
%8 = tensor.empty() : tensor<64x256xf32>
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32>
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32>
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) {
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32>
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32>
%12 = tensor.empty() : tensor<16x64xf32>
%13 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%12 : tensor<16x64xf32>) -> tensor<16x64xf32>
%14 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<16x128xf32>, tensor<128x64xf32>) outs(%13 : tensor<16x64xf32>) -> tensor<16x64xf32>
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32>
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%14, %extracted_slice_1 : tensor<16x64xf32>, tensor<64xf32>) outs(%extracted_slice_2 : tensor<16x64xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} {
^bb0(%in: f32, %in_3: f32, %out: f32):
%16 = arith.addf %in, %in_3 : f32
linalg.yield %16 : f32
} -> tensor<16x64xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %15 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
return
}
// -----// IR Dump After LLVMCPUTileAndFusePass (iree-llvmcpu-tile-and-fuse) //----- //
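// Note: the 16x64 workgroup tile is further tiled to 8x32 (the vector-parallel level of the
// lowering_config) with an scf.for nest inside the forall, and the fill and matmul are fused into
// each 8x32 tile. The reduction dimension (128) is still untiled at this point.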
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} {
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32>
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32>
%8 = tensor.empty() : tensor<64x256xf32>
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32>
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32>
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) {
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32>
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32>
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32>
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32>
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) {
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) {
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32>
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32>
%14 = tensor.empty() : tensor<8x32xf32>
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32>
%16 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_3, %extracted_slice_4 : tensor<8x128xf32>, tensor<128x32xf32>) outs(%15 : tensor<8x32xf32>) -> tensor<8x32xf32>
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32>
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} {
^bb0(%in: f32, %in_7: f32, %out: f32):
%18 = arith.addf %in, %in_7 : f32
linalg.yield %18 : f32
} -> tensor<8x32xf32>
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32>
scf.yield %inserted_slice : tensor<16x64xf32>
}
scf.yield %13 : tensor<16x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
return
}
// -----// IR Dump After FuseTensorPadWithConsumerPass (iree-codegen-fuse-tensor-pad-with-consumer) //----- //
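// Note: again a no-op; tiling introduced no tensor.pad ops to fuse.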
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} {
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32>
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32>
%8 = tensor.empty() : tensor<64x256xf32>
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32>
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32>
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) {
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32>
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32>
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32>
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32>
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) {
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) {
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32>
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32>
%14 = tensor.empty() : tensor<8x32xf32>
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32>
%16 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_3, %extracted_slice_4 : tensor<8x128xf32>, tensor<128x32xf32>) outs(%15 : tensor<8x32xf32>) -> tensor<8x32xf32>
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32>
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} {
^bb0(%in: f32, %in_7: f32, %out: f32):
%18 = arith.addf %in, %in_7 : f32
linalg.yield %18 : f32
} -> tensor<8x32xf32>
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32>
scf.yield %inserted_slice : tensor<16x64xf32>
}
scf.yield %13 : tensor<16x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
return
}
// -----// IR Dump After ConcretizePadResultShapePass (iree-codegen-concretize-pad-result-shape) //----- //
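// Note: a no-op for the same reason as above.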
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} {
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32>
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32>
%8 = tensor.empty() : tensor<64x256xf32>
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32>
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32>
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) {
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32>
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32>
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32>
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32>
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) {
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) {
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32>
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32>
%14 = tensor.empty() : tensor<8x32xf32>
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32>
%16 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_3, %extracted_slice_4 : tensor<8x128xf32>, tensor<128x32xf32>) outs(%15 : tensor<8x32xf32>) -> tensor<8x32xf32>
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32>
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} {
^bb0(%in: f32, %in_7: f32, %out: f32):
%18 = arith.addf %in, %in_7 : f32
linalg.yield %18 : f32
} -> tensor<8x32xf32>
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32>
scf.yield %inserted_slice : tensor<16x64xf32>
}
scf.yield %13 : tensor<16x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
return
}
// -----// IR Dump After LLVMCPUTilePass (iree-llvmcpu-tile) //----- //
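// Note: this dump is unchanged from the previous one; the reduction level of the matmul config
// ([0, 0, 16]) has not yet been materialized as a k-loop, presumably because this LLVMCPUTilePass
// invocation targets a tiling level whose sizes are all zero for these ops.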
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} {
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32>
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32>
%8 = tensor.empty() : tensor<64x256xf32>
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32>
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32>
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) {
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32>
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32>
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32>
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32>
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) {
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) {
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32>
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32>
%14 = tensor.empty() : tensor<8x32xf32>
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32>
%16 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_3, %extracted_slice_4 : tensor<8x128xf32>, tensor<128x32xf32>) outs(%15 : tensor<8x32xf32>) -> tensor<8x32xf32>
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32>
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} {
^bb0(%in: f32, %in_7: f32, %out: f32):
%18 = arith.addf %in, %in_7 : f32
linalg.yield %18 : f32
} -> tensor<8x32xf32>
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32>
scf.yield %inserted_slice : tensor<16x64xf32>
}
scf.yield %13 : tensor<16x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
return
}
// -----// IR Dump After LLVMCPUTileAndFusePass (iree-llvmcpu-tile-and-fuse) //----- //
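// note: this tile-and-fuse run appears to leave the dispatch unchanged from the preceding dump; the second result (%11) is already computed inside a workgroup-mapped scf.forall (16x64 tiles) with inner 8x32 scf.for loops, while the first result (%10) is still a single untiled 64x256 linalg.matmul.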
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} {
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32>
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32>
%8 = tensor.empty() : tensor<64x256xf32>
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32>
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32>
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) {
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32>
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32>
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32>
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32>
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) {
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) {
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32>
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32>
%14 = tensor.empty() : tensor<8x32xf32>
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32>
%16 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_3, %extracted_slice_4 : tensor<8x128xf32>, tensor<128x32xf32>) outs(%15 : tensor<8x32xf32>) -> tensor<8x32xf32>
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32>
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} {
^bb0(%in: f32, %in_7: f32, %out: f32):
%18 = arith.addf %in, %in_7 : f32
linalg.yield %18 : f32
} -> tensor<8x32xf32>
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32>
scf.yield %inserted_slice : tensor<16x64xf32>
}
scf.yield %13 : tensor<16x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
return
}
// -----// IR Dump After FuseTensorPadWithConsumerPass (iree-codegen-fuse-tensor-pad-with-consumer) //----- //
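// note: there are no tensor.pad ops in this dispatch, so pad/consumer fusion has nothing to do and the IR is unchanged.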
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} {
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32>
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32>
%8 = tensor.empty() : tensor<64x256xf32>
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32>
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32>
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) {
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32>
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32>
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32>
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32>
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) {
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) {
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32>
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32>
%14 = tensor.empty() : tensor<8x32xf32>
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32>
%16 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_3, %extracted_slice_4 : tensor<8x128xf32>, tensor<128x32xf32>) outs(%15 : tensor<8x32xf32>) -> tensor<8x32xf32>
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32>
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} {
^bb0(%in: f32, %in_7: f32, %out: f32):
%18 = arith.addf %in, %in_7 : f32
linalg.yield %18 : f32
} -> tensor<8x32xf32>
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32>
scf.yield %inserted_slice : tensor<16x64xf32>
}
scf.yield %13 : tensor<16x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
return
}
// -----// IR Dump After ConcretizePadResultShapePass (iree-codegen-concretize-pad-result-shape) //----- //
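// note: likewise unchanged; there are no pad result shapes to concretize.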
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} {
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32>
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32>
%8 = tensor.empty() : tensor<64x256xf32>
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32>
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32>
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) {
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32>
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32>
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32>
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32>
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) {
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) {
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32>
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32>
%14 = tensor.empty() : tensor<8x32xf32>
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32>
%16 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_3, %extracted_slice_4 : tensor<8x128xf32>, tensor<128x32xf32>) outs(%15 : tensor<8x32xf32>) -> tensor<8x32xf32>
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32>
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} {
^bb0(%in: f32, %in_7: f32, %out: f32):
%18 = arith.addf %in, %in_7 : f32
linalg.yield %18 : f32
} -> tensor<8x32xf32>
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32>
scf.yield %inserted_slice : tensor<16x64xf32>
}
scf.yield %13 : tensor<16x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
return
}
// -----// IR Dump After LLVMCPUSplitReductionPass (iree-llvmcpu-split-reduction) //----- //
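// note: split reduction makes no change to this dispatch; both matmuls still carry their full 128-element reduction dimension.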
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} {
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32>
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32>
%8 = tensor.empty() : tensor<64x256xf32>
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32>
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32>
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) {
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32>
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32>
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32>
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32>
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) {
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) {
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32>
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32>
%14 = tensor.empty() : tensor<8x32xf32>
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32>
%16 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_3, %extracted_slice_4 : tensor<8x128xf32>, tensor<128x32xf32>) outs(%15 : tensor<8x32xf32>) -> tensor<8x32xf32>
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32>
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} {
^bb0(%in: f32, %in_7: f32, %out: f32):
%18 = arith.addf %in, %in_7 : f32
linalg.yield %18 : f32
} -> tensor<8x32xf32>
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32>
scf.yield %inserted_slice : tensor<16x64xf32>
}
scf.yield %13 : tensor<16x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
return
}
// -----// IR Dump After LLVMCPUTilePass (iree-llvmcpu-tile) //----- //
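// note: this pass tiles the reduction (K) dimension by 16, matching the [0, 0, 16] entry of the matmul lowering config: the standalone matmul feeding %10 now accumulates inside an scf.for over K with 64x16 * 16x256 slices, and the matmul inside the scf.forall accumulates over its own scf.for with 8x16 * 16x32 slices.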
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} {
%c128 = arith.constant 128 : index
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32>
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32>
%8 = tensor.empty() : tensor<64x256xf32>
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32>
%10 = scf.for %arg0 = %c0 to %c128 step %c16 iter_args(%arg1 = %9) -> (tensor<64x256xf32>) {
%extracted_slice = tensor.extract_slice %5[0, %arg0] [64, 16] [1, 1] : tensor<64x128xf32> to tensor<64x16xf32>
%extracted_slice_0 = tensor.extract_slice %6[%arg0, 0] [16, 256] [1, 1] : tensor<128x256xf32> to tensor<16x256xf32>
%12 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x16xf32>, tensor<16x256xf32>) outs(%arg1 : tensor<64x256xf32>) -> tensor<64x256xf32>
scf.yield %12 : tensor<64x256xf32>
}
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) {
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32>
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32>
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32>
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32>
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) {
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) {
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32>
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32>
%14 = tensor.empty() : tensor<8x32xf32>
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32>
%16 = scf.for %arg7 = %c0 to %c128 step %c16 iter_args(%arg8 = %15) -> (tensor<8x32xf32>) {
%extracted_slice_7 = tensor.extract_slice %extracted_slice_3[0, %arg7] [8, 16] [1, 1] : tensor<8x128xf32> to tensor<8x16xf32>
%extracted_slice_8 = tensor.extract_slice %extracted_slice_4[%arg7, 0] [16, 32] [1, 1] : tensor<128x32xf32> to tensor<16x32xf32>
%18 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_7, %extracted_slice_8 : tensor<8x16xf32>, tensor<16x32xf32>) outs(%arg8 : tensor<8x32xf32>) -> tensor<8x32xf32>
scf.yield %18 : tensor<8x32xf32>
}
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32>
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} {
^bb0(%in: f32, %in_7: f32, %out: f32):
%18 = arith.addf %in, %in_7 : f32
linalg.yield %18 : f32
} -> tensor<8x32xf32>
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32>
scf.yield %inserted_slice : tensor<16x64xf32>
}
scf.yield %13 : tensor<16x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
return
}
// -----// IR Dump After LLVMCPUTileAndFusePass (iree-llvmcpu-tile-and-fuse) //----- //
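// note: no further fusion opportunities; the IR is unchanged from the previous dump.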
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} {
%c128 = arith.constant 128 : index
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32>
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32>
%8 = tensor.empty() : tensor<64x256xf32>
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32>
%10 = scf.for %arg0 = %c0 to %c128 step %c16 iter_args(%arg1 = %9) -> (tensor<64x256xf32>) {
%extracted_slice = tensor.extract_slice %5[0, %arg0] [64, 16] [1, 1] : tensor<64x128xf32> to tensor<64x16xf32>
%extracted_slice_0 = tensor.extract_slice %6[%arg0, 0] [16, 256] [1, 1] : tensor<128x256xf32> to tensor<16x256xf32>
%12 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x16xf32>, tensor<16x256xf32>) outs(%arg1 : tensor<64x256xf32>) -> tensor<64x256xf32>
scf.yield %12 : tensor<64x256xf32>
}
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) {
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32>
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32>
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32>
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32>
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) {
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) {
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32>
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32>
%14 = tensor.empty() : tensor<8x32xf32>
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32>
%16 = scf.for %arg7 = %c0 to %c128 step %c16 iter_args(%arg8 = %15) -> (tensor<8x32xf32>) {
%extracted_slice_7 = tensor.extract_slice %extracted_slice_3[0, %arg7] [8, 16] [1, 1] : tensor<8x128xf32> to tensor<8x16xf32>
%extracted_slice_8 = tensor.extract_slice %extracted_slice_4[%arg7, 0] [16, 32] [1, 1] : tensor<128x32xf32> to tensor<16x32xf32>
%18 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_7, %extracted_slice_8 : tensor<8x16xf32>, tensor<16x32xf32>) outs(%arg8 : tensor<8x32xf32>) -> tensor<8x32xf32>
scf.yield %18 : tensor<8x32xf32>
}
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32>
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} {
^bb0(%in: f32, %in_7: f32, %out: f32):
%18 = arith.addf %in, %in_7 : f32
linalg.yield %18 : f32
} -> tensor<8x32xf32>
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32>
scf.yield %inserted_slice : tensor<16x64xf32>
}
scf.yield %13 : tensor<16x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
return
}
// -----// IR Dump After FuseTensorPadWithConsumerPass (iree-codegen-fuse-tensor-pad-with-consumer) //----- //
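// note: still no tensor.pad ops to fuse; unchanged.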
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} {
%c128 = arith.constant 128 : index
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32>
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32>
%8 = tensor.empty() : tensor<64x256xf32>
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32>
%10 = scf.for %arg0 = %c0 to %c128 step %c16 iter_args(%arg1 = %9) -> (tensor<64x256xf32>) {
%extracted_slice = tensor.extract_slice %5[0, %arg0] [64, 16] [1, 1] : tensor<64x128xf32> to tensor<64x16xf32>
%extracted_slice_0 = tensor.extract_slice %6[%arg0, 0] [16, 256] [1, 1] : tensor<128x256xf32> to tensor<16x256xf32>
%12 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x16xf32>, tensor<16x256xf32>) outs(%arg1 : tensor<64x256xf32>) -> tensor<64x256xf32>
scf.yield %12 : tensor<64x256xf32>
}
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) {
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32>
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32>
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32>
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32>
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) {
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) {
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32>
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32>
%14 = tensor.empty() : tensor<8x32xf32>
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32>
%16 = scf.for %arg7 = %c0 to %c128 step %c16 iter_args(%arg8 = %15) -> (tensor<8x32xf32>) {
%extracted_slice_7 = tensor.extract_slice %extracted_slice_3[0, %arg7] [8, 16] [1, 1] : tensor<8x128xf32> to tensor<8x16xf32>
%extracted_slice_8 = tensor.extract_slice %extracted_slice_4[%arg7, 0] [16, 32] [1, 1] : tensor<128x32xf32> to tensor<16x32xf32>
%18 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_7, %extracted_slice_8 : tensor<8x16xf32>, tensor<16x32xf32>) outs(%arg8 : tensor<8x32xf32>) -> tensor<8x32xf32>
scf.yield %18 : tensor<8x32xf32>
}
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32>
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} {
^bb0(%in: f32, %in_7: f32, %out: f32):
%18 = arith.addf %in, %in_7 : f32
linalg.yield %18 : f32
} -> tensor<8x32xf32>
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32>
scf.yield %inserted_slice : tensor<16x64xf32>
}
scf.yield %13 : tensor<16x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
return
}
// -----// IR Dump After ConcretizePadResultShapePass (iree-codegen-concretize-pad-result-shape) //----- //
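// note: unchanged; nothing to concretize.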
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} {
%c128 = arith.constant 128 : index
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32>
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32>
%8 = tensor.empty() : tensor<64x256xf32>
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32>
%10 = scf.for %arg0 = %c0 to %c128 step %c16 iter_args(%arg1 = %9) -> (tensor<64x256xf32>) {
%extracted_slice = tensor.extract_slice %5[0, %arg0] [64, 16] [1, 1] : tensor<64x128xf32> to tensor<64x16xf32>
%extracted_slice_0 = tensor.extract_slice %6[%arg0, 0] [16, 256] [1, 1] : tensor<128x256xf32> to tensor<16x256xf32>
%12 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x16xf32>, tensor<16x256xf32>) outs(%arg1 : tensor<64x256xf32>) -> tensor<64x256xf32>
scf.yield %12 : tensor<64x256xf32>
}
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) {
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32>
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32>
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32>
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32>
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) {
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) {
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32>
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32>
%14 = tensor.empty() : tensor<8x32xf32>
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32>
%16 = scf.for %arg7 = %c0 to %c128 step %c16 iter_args(%arg8 = %15) -> (tensor<8x32xf32>) {
%extracted_slice_7 = tensor.extract_slice %extracted_slice_3[0, %arg7] [8, 16] [1, 1] : tensor<8x128xf32> to tensor<8x16xf32>
%extracted_slice_8 = tensor.extract_slice %extracted_slice_4[%arg7, 0] [16, 32] [1, 1] : tensor<128x32xf32> to tensor<16x32xf32>
%18 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_7, %extracted_slice_8 : tensor<8x16xf32>, tensor<16x32xf32>) outs(%arg8 : tensor<8x32xf32>) -> tensor<8x32xf32>
scf.yield %18 : tensor<8x32xf32>
}
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32>
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} {
^bb0(%in: f32, %in_7: f32, %out: f32):
%18 = arith.addf %in, %in_7 : f32
linalg.yield %18 : f32
} -> tensor<8x32xf32>
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32>
scf.yield %inserted_slice : tensor<16x64xf32>
}
scf.yield %13 : tensor<16x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
return
}
// -----// IR Dump After LLVMCPUPeelPass (iree-llvmcpu-peel) //----- //
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} {
%c128 = arith.constant 128 : index
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32>
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32>
%8 = tensor.empty() : tensor<64x256xf32>
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32>
%10 = scf.for %arg0 = %c0 to %c128 step %c16 iter_args(%arg1 = %9) -> (tensor<64x256xf32>) {
%extracted_slice = tensor.extract_slice %5[0, %arg0] [64, 16] [1, 1] : tensor<64x128xf32> to tensor<64x16xf32>
%extracted_slice_0 = tensor.extract_slice %6[%arg0, 0] [16, 256] [1, 1] : tensor<128x256xf32> to tensor<16x256xf32>
%12 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x16xf32>, tensor<16x256xf32>) outs(%arg1 : tensor<64x256xf32>) -> tensor<64x256xf32>
scf.yield %12 : tensor<64x256xf32>
}
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) {
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32>
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32>
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32>
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32>
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) {
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) {
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32>
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32>
%14 = tensor.empty() : tensor<8x32xf32>
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32>
%16 = scf.for %arg7 = %c0 to %c128 step %c16 iter_args(%arg8 = %15) -> (tensor<8x32xf32>) {
%extracted_slice_7 = tensor.extract_slice %extracted_slice_3[0, %arg7] [8, 16] [1, 1] : tensor<8x128xf32> to tensor<8x16xf32>
%extracted_slice_8 = tensor.extract_slice %extracted_slice_4[%arg7, 0] [16, 32] [1, 1] : tensor<128x32xf32> to tensor<16x32xf32>
%18 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_7, %extracted_slice_8 : tensor<8x16xf32>, tensor<16x32xf32>) outs(%arg8 : tensor<8x32xf32>) -> tensor<8x32xf32>
scf.yield %18 : tensor<8x32xf32>
}
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32>
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} {
^bb0(%in: f32, %in_7: f32, %out: f32):
%18 = arith.addf %in, %in_7 : f32
linalg.yield %18 : f32
} -> tensor<8x32xf32>
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32>
scf.yield %inserted_slice : tensor<16x64xf32>
}
scf.yield %13 : tensor<16x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
return
}
// -----// IR Dump After TensorToVectorVectorizePadPass (iree-codegen-vectorize-tensor-pad) //----- //
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} {
%c128 = arith.constant 128 : index
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32>
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32>
%8 = tensor.empty() : tensor<64x256xf32>
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32>
%10 = scf.for %arg0 = %c0 to %c128 step %c16 iter_args(%arg1 = %9) -> (tensor<64x256xf32>) {
%extracted_slice = tensor.extract_slice %5[0, %arg0] [64, 16] [1, 1] : tensor<64x128xf32> to tensor<64x16xf32>
%extracted_slice_0 = tensor.extract_slice %6[%arg0, 0] [16, 256] [1, 1] : tensor<128x256xf32> to tensor<16x256xf32>
%12 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x16xf32>, tensor<16x256xf32>) outs(%arg1 : tensor<64x256xf32>) -> tensor<64x256xf32>
scf.yield %12 : tensor<64x256xf32>
}
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) {
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32>
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32>
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32>
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32>
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) {
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) {
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32>
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32>
%14 = tensor.empty() : tensor<8x32xf32>
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32>
%16 = scf.for %arg7 = %c0 to %c128 step %c16 iter_args(%arg8 = %15) -> (tensor<8x32xf32>) {
%extracted_slice_7 = tensor.extract_slice %extracted_slice_3[0, %arg7] [8, 16] [1, 1] : tensor<8x128xf32> to tensor<8x16xf32>
%extracted_slice_8 = tensor.extract_slice %extracted_slice_4[%arg7, 0] [16, 32] [1, 1] : tensor<128x32xf32> to tensor<16x32xf32>
%18 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_7, %extracted_slice_8 : tensor<8x16xf32>, tensor<16x32xf32>) outs(%arg8 : tensor<8x32xf32>) -> tensor<8x32xf32>
scf.yield %18 : tensor<8x32xf32>
}
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32>
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} {
^bb0(%in: f32, %in_7: f32, %out: f32):
%18 = arith.addf %in, %in_7 : f32
linalg.yield %18 : f32
} -> tensor<8x32xf32>
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32>
scf.yield %inserted_slice : tensor<16x64xf32>
}
scf.yield %13 : tensor<16x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
return
}
// -----// IR Dump After GenericVectorizationPass (iree-codegen-generic-vectorization) //----- //
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} {
%cst = arith.constant dense<0.000000e+00> : vector<8x32xf32>
%cst_0 = arith.constant dense<0.000000e+00> : vector<64x256xf32>
%c128 = arith.constant 128 : index
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%cst_1 = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32>
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32>
%8 = tensor.empty() : tensor<64x256xf32>
%9 = vector.transfer_write %cst_0, %8[%c0, %c0] {in_bounds = [true, true]} : vector<64x256xf32>, tensor<64x256xf32>
%10 = scf.for %arg0 = %c0 to %c128 step %c16 iter_args(%arg1 = %9) -> (tensor<64x256xf32>) {
%extracted_slice = tensor.extract_slice %5[0, %arg0] [64, 16] [1, 1] : tensor<64x128xf32> to tensor<64x16xf32>
%extracted_slice_2 = tensor.extract_slice %6[%arg0, 0] [16, 256] [1, 1] : tensor<128x256xf32> to tensor<16x256xf32>
%12 = vector.transfer_read %extracted_slice[%c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<64x16xf32>, vector<64x16xf32>
%13 = vector.transfer_read %extracted_slice_2[%c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<16x256xf32>, vector<16x256xf32>
%14 = vector.transfer_read %arg1[%c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<64x256xf32>, vector<64x256xf32>
%15 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %12, %13, %14 : vector<64x16xf32>, vector<16x256xf32> into vector<64x256xf32>
%16 = vector.transfer_write %15, %arg1[%c0, %c0] {in_bounds = [true, true]} : vector<64x256xf32>, tensor<64x256xf32>
scf.yield %16 : tensor<64x256xf32>
}
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) {
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32>
%extracted_slice_2 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32>
%extracted_slice_3 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32>
%extracted_slice_4 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32>
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_4) -> (tensor<16x64xf32>) {
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) {
%extracted_slice_5 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32>
%extracted_slice_6 = tensor.extract_slice %extracted_slice_2[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32>
%14 = tensor.empty() : tensor<8x32xf32>
%15 = vector.transfer_write %cst, %14[%c0, %c0] {in_bounds = [true, true]} : vector<8x32xf32>, tensor<8x32xf32>
%16 = scf.for %arg7 = %c0 to %c128 step %c16 iter_args(%arg8 = %15) -> (tensor<8x32xf32>) {
%extracted_slice_9 = tensor.extract_slice %extracted_slice_5[0, %arg7] [8, 16] [1, 1] : tensor<8x128xf32> to tensor<8x16xf32>
%extracted_slice_10 = tensor.extract_slice %extracted_slice_6[%arg7, 0] [16, 32] [1, 1] : tensor<128x32xf32> to tensor<16x32xf32>
%22 = vector.transfer_read %extracted_slice_9[%c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<8x16xf32>, vector<8x16xf32>
%23 = vector.transfer_read %extracted_slice_10[%c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<16x32xf32>, vector<16x32xf32>
%24 = vector.transfer_read %arg8[%c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<8x32xf32>, vector<8x32xf32>
%25 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %22, %23, %24 : vector<8x16xf32>, vector<16x32xf32> into vector<8x32xf32>
%26 = vector.transfer_write %25, %arg8[%c0, %c0] {in_bounds = [true, true]} : vector<8x32xf32>, tensor<8x32xf32>
scf.yield %26 : tensor<8x32xf32>
}
%extracted_slice_7 = tensor.extract_slice %extracted_slice_3[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32>
%extracted_slice_8 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32>
%17 = vector.transfer_read %16[%c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<8x32xf32>, vector<8x32xf32>
%18 = vector.transfer_read %extracted_slice_7[%c0], %cst_1 {in_bounds = [true]} : tensor<32xf32>, vector<32xf32>
%19 = vector.broadcast %18 : vector<32xf32> to vector<8x32xf32>
%20 = arith.addf %17, %19 : vector<8x32xf32>
%21 = vector.transfer_write %20, %extracted_slice_8[%c0, %c0] {in_bounds = [true, true]} : vector<8x32xf32>, tensor<8x32xf32>
%inserted_slice = tensor.insert_slice %21 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32>
scf.yield %inserted_slice : tensor<16x64xf32>
}
scf.yield %13 : tensor<16x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
return
}
// -----// IR Dump After OptimizeTensorInsertExtractSlicesPass (iree-codegen-optimize-tensor-insert-extract-slices) //----- //
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} {
%cst = arith.constant dense<0.000000e+00> : vector<8x32xf32>
%cst_0 = arith.constant dense<0.000000e+00> : vector<64x256xf32>
%c128 = arith.constant 128 : index
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%cst_1 = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32>
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32>
%8 = tensor.empty() : tensor<64x256xf32>
%9 = scf.for %arg0 = %c0 to %c128 step %c16 iter_args(%arg1 = %cst_0) -> (vector<64x256xf32>) {
%12 = vector.transfer_read %5[%c0, %arg0], %cst_1 {in_bounds = [true, true]} : tensor<64x128xf32>, vector<64x16xf32>
%13 = vector.transfer_read %6[%arg0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<128x256xf32>, vector<16x256xf32>
%14 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %12, %13, %arg1 : vector<64x16xf32>, vector<16x256xf32> into vector<64x256xf32>
scf.yield %14 : vector<64x256xf32>
}
%10 = vector.transfer_write %9, %8[%c0, %c0] {in_bounds = [true, true]} : vector<64x256xf32>, tensor<64x256xf32>
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32>
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice) -> (tensor<16x64xf32>) {
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) {
%14 = scf.for %arg7 = %c0 to %c128 step %c16 iter_args(%arg8 = %cst) -> (vector<8x32xf32>) {
%20 = arith.addi %arg3, %arg0 : index
%21 = vector.transfer_read %5[%20, %arg7], %cst_1 {in_bounds = [true, true]} : tensor<64x128xf32>, vector<8x16xf32>
%22 = arith.addi %arg5, %arg1 : index
%23 = vector.transfer_read %6[%arg7, %22], %cst_1 {in_bounds = [true, true]} : tensor<128x256xf32>, vector<16x32xf32>
%24 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %21, %23, %arg8 : vector<8x16xf32>, vector<16x32xf32> into vector<8x32xf32>
scf.yield %24 : vector<8x32xf32>
}
%15 = arith.addi %arg5, %arg1 : index
%16 = vector.transfer_read %7[%15], %cst_1 {in_bounds = [true]} : tensor<256xf32>, vector<32xf32>
%17 = vector.broadcast %16 : vector<32xf32> to vector<8x32xf32>
%18 = arith.addf %14, %17 : vector<8x32xf32>
%19 = vector.transfer_write %18, %arg6[%arg3, %arg5] {in_bounds = [true, true]} : vector<8x32xf32>, tensor<16x64xf32>
scf.yield %19 : tensor<16x64xf32>
}
scf.yield %13 : tensor<16x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} {
%cst = arith.constant dense<0.000000e+00> : vector<8x32xf32>
%cst_0 = arith.constant dense<0.000000e+00> : vector<64x256xf32>
%c128 = arith.constant 128 : index
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%cst_1 = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32>
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32>
%8 = tensor.empty() : tensor<64x256xf32>
%9 = scf.for %arg0 = %c0 to %c128 step %c16 iter_args(%arg1 = %cst_0) -> (vector<64x256xf32>) {
%12 = vector.transfer_read %5[%c0, %arg0], %cst_1 {in_bounds = [true, true]} : tensor<64x128xf32>, vector<64x16xf32>
%13 = vector.transfer_read %6[%arg0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<128x256xf32>, vector<16x256xf32>
%14 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %12, %13, %arg1 : vector<64x16xf32>, vector<16x256xf32> into vector<64x256xf32>
scf.yield %14 : vector<64x256xf32>
}
%10 = vector.transfer_write %9, %8[%c0, %c0] {in_bounds = [true, true]} : vector<64x256xf32>, tensor<64x256xf32>
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32>
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice) -> (tensor<16x64xf32>) {
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) {
%14 = scf.for %arg7 = %c0 to %c128 step %c16 iter_args(%arg8 = %cst) -> (vector<8x32xf32>) {
%20 = arith.addi %arg3, %arg0 : index
%21 = vector.transfer_read %5[%20, %arg7], %cst_1 {in_bounds = [true, true]} : tensor<64x128xf32>, vector<8x16xf32>
%22 = arith.addi %arg5, %arg1 : index
%23 = vector.transfer_read %6[%arg7, %22], %cst_1 {in_bounds = [true, true]} : tensor<128x256xf32>, vector<16x32xf32>
%24 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %21, %23, %arg8 : vector<8x16xf32>, vector<16x32xf32> into vector<8x32xf32>
scf.yield %24 : vector<8x32xf32>
}
%15 = arith.addi %arg5, %arg1 : index
%16 = vector.transfer_read %7[%15], %cst_1 {in_bounds = [true]} : tensor<256xf32>, vector<32xf32>
%17 = vector.broadcast %16 : vector<32xf32> to vector<8x32xf32>
%18 = arith.addf %14, %17 : vector<8x32xf32>
%19 = vector.transfer_write %18, %arg6[%arg3, %arg5] {in_bounds = [true, true]} : vector<8x32xf32>, tensor<16x64xf32>
scf.yield %19 : tensor<16x64xf32>
}
scf.yield %13 : tensor<16x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} {
%cst = arith.constant dense<0.000000e+00> : vector<8x32xf32>
%cst_0 = arith.constant dense<0.000000e+00> : vector<64x256xf32>
%c128 = arith.constant 128 : index
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%cst_1 = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32>
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32>
%8 = tensor.empty() : tensor<64x256xf32>
%9 = scf.for %arg0 = %c0 to %c128 step %c16 iter_args(%arg1 = %cst_0) -> (vector<64x256xf32>) {
%12 = vector.transfer_read %5[%c0, %arg0], %cst_1 {in_bounds = [true, true]} : tensor<64x128xf32>, vector<64x16xf32>
%13 = vector.transfer_read %6[%arg0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<128x256xf32>, vector<16x256xf32>
%14 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %12, %13, %arg1 : vector<64x16xf32>, vector<16x256xf32> into vector<64x256xf32>
scf.yield %14 : vector<64x256xf32>
}
%10 = vector.transfer_write %9, %8[%c0, %c0] {in_bounds = [true, true]} : vector<64x256xf32>, tensor<64x256xf32>
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32>
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice) -> (tensor<16x64xf32>) {
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) {
%14 = scf.for %arg7 = %c0 to %c128 step %c16 iter_args(%arg8 = %cst) -> (vector<8x32xf32>) {
%20 = arith.addi %arg3, %arg0 : index
%21 = vector.transfer_read %5[%20, %arg7], %cst_1 {in_bounds = [true, true]} : tensor<64x128xf32>, vector<8x16xf32>
%22 = arith.addi %arg5, %arg1 : index
%23 = vector.transfer_read %6[%arg7, %22], %cst_1 {in_bounds = [true, true]} : tensor<128x256xf32>, vector<16x32xf32>
%24 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %21, %23, %arg8 : vector<8x16xf32>, vector<16x32xf32> into vector<8x32xf32>
scf.yield %24 : vector<8x32xf32>
}
%15 = arith.addi %arg5, %arg1 : index
%16 = vector.transfer_read %7[%15], %cst_1 {in_bounds = [true]} : tensor<256xf32>, vector<32xf32>
%17 = vector.broadcast %16 : vector<32xf32> to vector<8x32xf32>
%18 = arith.addf %14, %17 : vector<8x32xf32>
%19 = vector.transfer_write %18, %arg6[%arg3, %arg5] {in_bounds = [true, true]} : vector<8x32xf32>, tensor<16x64xf32>
scf.yield %19 : tensor<16x64xf32>
}
scf.yield %13 : tensor<16x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
return
}
test.mlir:12:1: error: One or more operations with large vector sizes (32768 bytes) were found:
func.func @multi_result() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
^
<unknown>:0: note: %cst_0 = arith.constant dense<0.000000e+00> : vector<64x256xf32>
<unknown>:0: note: %14 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %12, %13, %arg1 : vector<64x16xf32>, vector<16x256xf32> into vector<64x256xf32>
test.mlir:27:9: note: scf.yield %14 : vector<64x256xf32>
%11 = linalg.matmul ins(%6, %7 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%10 : tensor<64x256xf32>) -> tensor<64x256xf32>
^
test.mlir:27:9: note:
%9 = scf.for %arg0 = %c0 to %c128 step %c16 iter_args(%arg1 = %cst_0) -> (vector<64x256xf32>) {
%12 = vector.transfer_read %5[%c0, %arg0], %cst_1 {in_bounds = [true, true]} : tensor<64x128xf32>, vector<64x16xf32>
%13 = vector.transfer_read %6[%arg0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<128x256xf32>, vector<16x256xf32>
%14 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %12, %13, %arg1 : vector<64x16xf32>, vector<16x256xf32> into vector<64x256xf32>
scf.yield %14 : vector<64x256xf32>
}
<unknown>:0: note: %10 = vector.transfer_write %9, %8[%c0, %c0] {in_bounds = [true, true]} : vector<64x256xf32>, tensor<64x256xf32>
// -----// IR Dump After LLVMCPUVerifyVectorSizeLegalityPass Failed (iree-llvmcpu-verify-vector-size-legality) //----- //
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} {
%cst = arith.constant dense<0.000000e+00> : vector<8x32xf32>
%cst_0 = arith.constant dense<0.000000e+00> : vector<64x256xf32>
%c128 = arith.constant 128 : index
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%cst_1 = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32>
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32>
%8 = tensor.empty() : tensor<64x256xf32>
%9 = scf.for %arg0 = %c0 to %c128 step %c16 iter_args(%arg1 = %cst_0) -> (vector<64x256xf32>) {
%12 = vector.transfer_read %5[%c0, %arg0], %cst_1 {in_bounds = [true, true]} : tensor<64x128xf32>, vector<64x16xf32>
%13 = vector.transfer_read %6[%arg0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<128x256xf32>, vector<16x256xf32>
%14 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %12, %13, %arg1 : vector<64x16xf32>, vector<16x256xf32> into vector<64x256xf32>
scf.yield %14 : vector<64x256xf32>
}
%10 = vector.transfer_write %9, %8[%c0, %c0] {in_bounds = [true, true]} : vector<64x256xf32>, tensor<64x256xf32>
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32>
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice) -> (tensor<16x64xf32>) {
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) {
%14 = scf.for %arg7 = %c0 to %c128 step %c16 iter_args(%arg8 = %cst) -> (vector<8x32xf32>) {
%20 = arith.addi %arg3, %arg0 : index
%21 = vector.transfer_read %5[%20, %arg7], %cst_1 {in_bounds = [true, true]} : tensor<64x128xf32>, vector<8x16xf32>
%22 = arith.addi %arg5, %arg1 : index
%23 = vector.transfer_read %6[%arg7, %22], %cst_1 {in_bounds = [true, true]} : tensor<128x256xf32>, vector<16x32xf32>
%24 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %21, %23, %arg8 : vector<8x16xf32>, vector<16x32xf32> into vector<8x32xf32>
scf.yield %24 : vector<8x32xf32>
}
%15 = arith.addi %arg5, %arg1 : index
%16 = vector.transfer_read %7[%15], %cst_1 {in_bounds = [true]} : tensor<256xf32>, vector<32xf32>
%17 = vector.broadcast %16 : vector<32xf32> to vector<8x32xf32>
%18 = arith.addf %14, %17 : vector<8x32xf32>
%19 = vector.transfer_write %18, %arg6[%arg3, %arg5] {in_bounds = [true, true]} : vector<8x32xf32>, tensor<16x64xf32>
scf.yield %19 : tensor<16x64xf32>
}
scf.yield %13 : tensor<16x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
return
}
// -----// IR Dump After LLVMCPULowerExecutableTargetPass Failed (iree-llvmcpu-lower-executable-target) //----- //
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} {
%cst = arith.constant dense<0.000000e+00> : vector<8x32xf32>
%cst_0 = arith.constant dense<0.000000e+00> : vector<64x256xf32>
%c128 = arith.constant 128 : index
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%cst_1 = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32>
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32>
%8 = tensor.empty() : tensor<64x256xf32>
%9 = scf.for %arg0 = %c0 to %c128 step %c16 iter_args(%arg1 = %cst_0) -> (vector<64x256xf32>) {
%12 = vector.transfer_read %5[%c0, %arg0], %cst_1 {in_bounds = [true, true]} : tensor<64x128xf32>, vector<64x16xf32>
%13 = vector.transfer_read %6[%arg0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<128x256xf32>, vector<16x256xf32>
%14 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %12, %13, %arg1 : vector<64x16xf32>, vector<16x256xf32> into vector<64x256xf32>
scf.yield %14 : vector<64x256xf32>
}
%10 = vector.transfer_write %9, %8[%c0, %c0] {in_bounds = [true, true]} : vector<64x256xf32>, tensor<64x256xf32>
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32>
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice) -> (tensor<16x64xf32>) {
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) {
%14 = scf.for %arg7 = %c0 to %c128 step %c16 iter_args(%arg8 = %cst) -> (vector<8x32xf32>) {
%20 = arith.addi %arg3, %arg0 : index
%21 = vector.transfer_read %5[%20, %arg7], %cst_1 {in_bounds = [true, true]} : tensor<64x128xf32>, vector<8x16xf32>
%22 = arith.addi %arg5, %arg1 : index
%23 = vector.transfer_read %6[%arg7, %22], %cst_1 {in_bounds = [true, true]} : tensor<128x256xf32>, vector<16x32xf32>
%24 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %21, %23, %arg8 : vector<8x16xf32>, vector<16x32xf32> into vector<8x32xf32>
scf.yield %24 : vector<8x32xf32>
}
%15 = arith.addi %arg5, %arg1 : index
%16 = vector.transfer_read %7[%15], %cst_1 {in_bounds = [true]} : tensor<256xf32>, vector<32xf32>
%17 = vector.broadcast %16 : vector<32xf32> to vector<8x32xf32>
%18 = arith.addf %14, %17 : vector<8x32xf32>
%19 = vector.transfer_write %18, %arg6[%arg3, %arg5] {in_bounds = [true, true]} : vector<8x32xf32>, tensor<16x64xf32>
scf.yield %19 : tensor<16x64xf32>
}
scf.yield %13 : tensor<16x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
return
}
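
The failure above is reported by LLVMCPUVerifyVectorSizeLegalityPass: the first matmul (the one stored to binding(4)) never gets distributed into the scf.forall, only the fused fill + matmul + bias chain feeding binding(5) does, so GenericVectorizationPass vectorizes it at the full 64x256 output tile and its loop-carried accumulator becomes a vector<64x256xf32>. At 4 bytes per f32 that single value is 65536 bytes, which is why the constant, the vector.contract, and the vector.transfer_write are the ops flagged in the diagnostic. I read the 32768 bytes quoted in the message as the allowed budget, but that interpretation is an assumption on my part. The sketch below just spells out the arithmetic; it is illustrative Python, not part of the IREE toolchain, and the helper name is made up.

# Illustrative only: byte sizes of the vector types that appear in the
# failing scf.for loop. Assumes densely packed f32 elements (4 bytes each);
# vector_bytes() is a made-up helper, not an IREE API.

def vector_bytes(shape, elem_bytes=4):
    """Return the size in bytes of a dense vector with the given shape."""
    total = elem_bytes
    for dim in shape:
        total *= dim
    return total

# Accumulator of the un-distributed first matmul (flagged in the diagnostic).
assert vector_bytes([64, 256]) == 65536   # exceeds the 32768-byte limit cited above
# Accumulator of the matmul that was tiled down inside the scf.forall.
assert vector_bytes([8, 32]) == 1024      # comfortably within that limit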