Created October 7, 2024 05:58
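IR dumps for the @multi_result dispatch: a 64x128 by 128x256 matmul with a fused bias add, where both the raw matmul result and the biased result are stored to separate output bindings, as it is lowered through IREE's LLVMCPU CPUDoubleTilingExpert pipeline.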
// -----// IR Dump After LLVMCPUSelectLoweringStrategyPass (iree-llvmcpu-select-lowering-strategy) //----- // | |
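// The strategy pass only attaches attributes: lowering configs with 16x64 workgroup tiles and 8x32 vector-parallel tiles for the fill and the bias-add generic (#config), a matmul config (#config1) that adds a 16-wide reduction tile, and CPUDoubleTilingExpert translation info with loop peeling enabled. The op structure is still the untiled fill + matmul + generic with two stores.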
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]> | |
#config1 = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}> | |
#map = affine_map<(d0, d1) -> (d0, d1)> | |
#map1 = affine_map<(d0, d1) -> (d1)> | |
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}> | |
module { | |
func.func @multi_result() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_, translation_info = #translation} { | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%10 = linalg.matmul {lowering_config = #config1} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%11 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%10, %7 : tensor<64x256xf32>, tensor<256xf32>) outs(%8 : tensor<64x256xf32>) attrs = {lowering_config = #config} { | |
^bb0(%in: f32, %in_0: f32, %out: f32): | |
%12 = arith.addf %in, %in_0 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<64x256xf32> | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
} | |
// -----// IR Dump After TileAndDistributeToWorkgroupsUsingForallOpPass (iree-codegen-tile-and-distribute-to-workgroups-using-forall-op) //----- // | |
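// Workgroup distribution: the bias-add generic is tiled into an scf.forall over 16x64 tiles mapped to workgroup y/x, with a fill and matmul fused into the loop body to produce each 16x64 input tile. The full-size fill/matmul (%9/%10) remain outside the forall because the matmul result is also stored to its own binding (the multi-result aspect this example exercises).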
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32> | |
%12 = tensor.empty() : tensor<16x64xf32> | |
%13 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%12 : tensor<16x64xf32>) -> tensor<16x64xf32> | |
%14 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<16x128xf32>, tensor<128x64xf32>) outs(%13 : tensor<16x64xf32>) -> tensor<16x64xf32> | |
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%14, %extracted_slice_1 : tensor<16x64xf32>, tensor<64xf32>) outs(%extracted_slice_2 : tensor<16x64xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} { | |
^bb0(%in: f32, %in_3: f32, %out: f32): | |
%16 = arith.addf %in, %in_3 : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<16x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %15 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
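// Canonicalization makes no visible change; the IR below is identical to the previous dump.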
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32> | |
%12 = tensor.empty() : tensor<16x64xf32> | |
%13 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%12 : tensor<16x64xf32>) -> tensor<16x64xf32> | |
%14 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<16x128xf32>, tensor<128x64xf32>) outs(%13 : tensor<16x64xf32>) -> tensor<16x64xf32> | |
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%14, %extracted_slice_1 : tensor<16x64xf32>, tensor<64xf32>) outs(%extracted_slice_2 : tensor<16x64xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} { | |
^bb0(%in: f32, %in_3: f32, %out: f32): | |
%16 = arith.addf %in, %in_3 : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<16x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %15 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
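// CSE finds nothing to eliminate; the IR below is identical to the previous dump.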
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32> | |
%12 = tensor.empty() : tensor<16x64xf32> | |
%13 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%12 : tensor<16x64xf32>) -> tensor<16x64xf32> | |
%14 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<16x128xf32>, tensor<128x64xf32>) outs(%13 : tensor<16x64xf32>) -> tensor<16x64xf32> | |
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%14, %extracted_slice_1 : tensor<16x64xf32>, tensor<64xf32>) outs(%extracted_slice_2 : tensor<16x64xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} { | |
^bb0(%in: f32, %in_3: f32, %out: f32): | |
%16 = arith.addf %in, %in_3 : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<16x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %15 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
// -----// IR Dump After FuseTensorPadWithConsumerPass (iree-codegen-fuse-tensor-pad-with-consumer) //----- // | |
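// There are no tensor.pad ops in this dispatch, so this pass leaves the IR unchanged.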
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32> | |
%12 = tensor.empty() : tensor<16x64xf32> | |
%13 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%12 : tensor<16x64xf32>) -> tensor<16x64xf32> | |
%14 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<16x128xf32>, tensor<128x64xf32>) outs(%13 : tensor<16x64xf32>) -> tensor<16x64xf32> | |
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%14, %extracted_slice_1 : tensor<16x64xf32>, tensor<64xf32>) outs(%extracted_slice_2 : tensor<16x64xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} { | |
^bb0(%in: f32, %in_3: f32, %out: f32): | |
%16 = arith.addf %in, %in_3 : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<16x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %15 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
// -----// IR Dump After ConcretizePadResultShapePass (iree-codegen-concretize-pad-result-shape) //----- // | |
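// Likewise, no pad result shapes to concretize; the IR is unchanged.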
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32> | |
%12 = tensor.empty() : tensor<16x64xf32> | |
%13 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%12 : tensor<16x64xf32>) -> tensor<16x64xf32> | |
%14 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<16x128xf32>, tensor<128x64xf32>) outs(%13 : tensor<16x64xf32>) -> tensor<16x64xf32> | |
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%14, %extracted_slice_1 : tensor<16x64xf32>, tensor<64xf32>) outs(%extracted_slice_2 : tensor<16x64xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} { | |
^bb0(%in: f32, %in_3: f32, %out: f32): | |
%16 = arith.addf %in, %in_3 : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<16x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %15 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
// -----// IR Dump After LLVMCPUTileAndFusePass (iree-llvmcpu-tile-and-fuse) //----- // | |
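// Vector-parallel tiling: inside each 16x64 workgroup tile, nested scf.for loops step by 8 and 32, and the fill, matmul, and bias-add generic are re-tiled and fused at 8x32 granularity, matching the [8, 32] level of the lowering configs.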
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%c32 = arith.constant 32 : index | |
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32> | |
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) { | |
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) { | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32> | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32> | |
%14 = tensor.empty() : tensor<8x32xf32> | |
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%16 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_3, %extracted_slice_4 : tensor<8x128xf32>, tensor<128x32xf32>) outs(%15 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32> | |
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} { | |
^bb0(%in: f32, %in_7: f32, %out: f32): | |
%18 = arith.addf %in, %in_7 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<8x32xf32> | |
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32> | |
scf.yield %inserted_slice : tensor<16x64xf32> | |
} | |
scf.yield %13 : tensor<16x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
// -----// IR Dump After FuseTensorPadWithConsumerPass (iree-codegen-fuse-tensor-pad-with-consumer) //----- // | |
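// Still no tensor.pad ops; the IR below is identical to the previous dump.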
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%c32 = arith.constant 32 : index | |
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32> | |
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) { | |
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) { | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32> | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32> | |
%14 = tensor.empty() : tensor<8x32xf32> | |
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%16 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_3, %extracted_slice_4 : tensor<8x128xf32>, tensor<128x32xf32>) outs(%15 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32> | |
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} { | |
^bb0(%in: f32, %in_7: f32, %out: f32): | |
%18 = arith.addf %in, %in_7 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<8x32xf32> | |
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32> | |
scf.yield %inserted_slice : tensor<16x64xf32> | |
} | |
scf.yield %13 : tensor<16x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
// -----// IR Dump After ConcretizePadResultShapePass (iree-codegen-concretize-pad-result-shape) //----- // | |
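// No change; identical to the previous dump.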
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%c32 = arith.constant 32 : index | |
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32> | |
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) { | |
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) { | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32> | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32> | |
%14 = tensor.empty() : tensor<8x32xf32> | |
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%16 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_3, %extracted_slice_4 : tensor<8x128xf32>, tensor<128x32xf32>) outs(%15 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32> | |
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} { | |
^bb0(%in: f32, %in_7: f32, %out: f32): | |
%18 = arith.addf %in, %in_7 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<8x32xf32> | |
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32> | |
scf.yield %inserted_slice : tensor<16x64xf32> | |
} | |
scf.yield %13 : tensor<16x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
// -----// IR Dump After LLVMCPUTilePass (iree-llvmcpu-tile) //----- // | |
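// The remaining tiling level in the matmul config is the [0, 0, 16] reduction tile, but in the portion of the dump shown here the loop nest and ops still match the previous dump.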
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%c32 = arith.constant 32 : index | |
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32> | |
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) { | |
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) { | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32> | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32> | |
%14 = tensor.empty() : tensor<8x32xf32> | |
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%16 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_3, %extracted_slice_4 : tensor<8x128xf32>, tensor<128x32xf32>) outs(%15 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32> | |
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} { | |
^bb0(%in: f32, %in_7: f32, %out: f32): | |
%18 = arith.addf %in, %in_7 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<8x32xf32> | |
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32> | |
scf.yield %inserted_slice : tensor<16x64xf32> | |
} | |
scf.yield %13 : tensor<16x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
// -----// IR Dump After LLVMCPUTileAndFusePass (iree-llvmcpu-tile-and-fuse) //----- // | |
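// Note: this dump is identical to the preceding one; LLVMCPUTileAndFusePass made no further changes at this stage.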
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%c32 = arith.constant 32 : index | |
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32> | |
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) { | |
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) { | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32> | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32> | |
%14 = tensor.empty() : tensor<8x32xf32> | |
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%16 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_3, %extracted_slice_4 : tensor<8x128xf32>, tensor<128x32xf32>) outs(%15 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32> | |
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} { | |
^bb0(%in: f32, %in_7: f32, %out: f32): | |
%18 = arith.addf %in, %in_7 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<8x32xf32> | |
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32> | |
scf.yield %inserted_slice : tensor<16x64xf32> | |
} | |
scf.yield %13 : tensor<16x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
// -----// IR Dump After FuseTensorPadWithConsumerPass (iree-codegen-fuse-tensor-pad-with-consumer) //----- // | |
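// Note: unchanged. There are no tensor.pad ops in this dispatch, so the pad-with-consumer fusion has nothing to rewrite.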
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%c32 = arith.constant 32 : index | |
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32> | |
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) { | |
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) { | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32> | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32> | |
%14 = tensor.empty() : tensor<8x32xf32> | |
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%16 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_3, %extracted_slice_4 : tensor<8x128xf32>, tensor<128x32xf32>) outs(%15 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32> | |
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} { | |
^bb0(%in: f32, %in_7: f32, %out: f32): | |
%18 = arith.addf %in, %in_7 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<8x32xf32> | |
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32> | |
scf.yield %inserted_slice : tensor<16x64xf32> | |
} | |
scf.yield %13 : tensor<16x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
// -----// IR Dump After ConcretizePadResultShapePass (iree-codegen-concretize-pad-result-shape) //----- // | |
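// Note: unchanged for the same reason; there are no tensor.pad results whose shapes need concretizing.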
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%c32 = arith.constant 32 : index | |
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32> | |
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) { | |
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) { | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32> | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32> | |
%14 = tensor.empty() : tensor<8x32xf32> | |
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%16 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_3, %extracted_slice_4 : tensor<8x128xf32>, tensor<128x32xf32>) outs(%15 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32> | |
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} { | |
^bb0(%in: f32, %in_7: f32, %out: f32): | |
%18 = arith.addf %in, %in_7 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<8x32xf32> | |
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32> | |
scf.yield %inserted_slice : tensor<16x64xf32> | |
} | |
scf.yield %13 : tensor<16x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
// -----// IR Dump After LLVMCPUSplitReductionPass (iree-llvmcpu-split-reduction) //----- // | |
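// Note: unchanged; split-reduction did not rewrite the matmuls in this dispatch.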
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%c32 = arith.constant 32 : index | |
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%5, %6 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32> | |
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) { | |
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) { | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32> | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32> | |
%14 = tensor.empty() : tensor<8x32xf32> | |
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%16 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_3, %extracted_slice_4 : tensor<8x128xf32>, tensor<128x32xf32>) outs(%15 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32> | |
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} { | |
^bb0(%in: f32, %in_7: f32, %out: f32): | |
%18 = arith.addf %in, %in_7 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<8x32xf32> | |
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32> | |
scf.yield %inserted_slice : tensor<16x64xf32> | |
} | |
scf.yield %13 : tensor<16x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
// -----// IR Dump After LLVMCPUTilePass (iree-llvmcpu-tile) //----- // | |
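// Note: this second run of LLVMCPUTilePass tiles the reduction (K = 128) dimension by 16, matching the
// [0, 0, 16] entry of the matmul lowering_config: the standalone matmul is now wrapped in an scf.for over K
// operating on 64x16 and 16x256 slices per iteration, and the fused 8x32 matmul inside the scf.forall gets its
// own K loop over 8x16 and 16x32 slices.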
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%10 = scf.for %arg0 = %c0 to %c128 step %c16 iter_args(%arg1 = %9) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[0, %arg0] [64, 16] [1, 1] : tensor<64x128xf32> to tensor<64x16xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[%arg0, 0] [16, 256] [1, 1] : tensor<128x256xf32> to tensor<16x256xf32> | |
%12 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x16xf32>, tensor<16x256xf32>) outs(%arg1 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
scf.yield %12 : tensor<64x256xf32> | |
} | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32> | |
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) { | |
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) { | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32> | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32> | |
%14 = tensor.empty() : tensor<8x32xf32> | |
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%16 = scf.for %arg7 = %c0 to %c128 step %c16 iter_args(%arg8 = %15) -> (tensor<8x32xf32>) { | |
%extracted_slice_7 = tensor.extract_slice %extracted_slice_3[0, %arg7] [8, 16] [1, 1] : tensor<8x128xf32> to tensor<8x16xf32> | |
%extracted_slice_8 = tensor.extract_slice %extracted_slice_4[%arg7, 0] [16, 32] [1, 1] : tensor<128x32xf32> to tensor<16x32xf32> | |
%18 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_7, %extracted_slice_8 : tensor<8x16xf32>, tensor<16x32xf32>) outs(%arg8 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
scf.yield %18 : tensor<8x32xf32> | |
} | |
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32> | |
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} { | |
^bb0(%in: f32, %in_7: f32, %out: f32): | |
%18 = arith.addf %in, %in_7 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<8x32xf32> | |
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32> | |
scf.yield %inserted_slice : tensor<16x64xf32> | |
} | |
scf.yield %13 : tensor<16x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
// -----// IR Dump After LLVMCPUTileAndFusePass (iree-llvmcpu-tile-and-fuse) //----- // | |
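// Note: identical to the preceding dump; no additional tiling or fusion happened here.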
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%10 = scf.for %arg0 = %c0 to %c128 step %c16 iter_args(%arg1 = %9) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[0, %arg0] [64, 16] [1, 1] : tensor<64x128xf32> to tensor<64x16xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[%arg0, 0] [16, 256] [1, 1] : tensor<128x256xf32> to tensor<16x256xf32> | |
%12 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x16xf32>, tensor<16x256xf32>) outs(%arg1 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
scf.yield %12 : tensor<64x256xf32> | |
} | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32> | |
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) { | |
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) { | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32> | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32> | |
%14 = tensor.empty() : tensor<8x32xf32> | |
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%16 = scf.for %arg7 = %c0 to %c128 step %c16 iter_args(%arg8 = %15) -> (tensor<8x32xf32>) { | |
%extracted_slice_7 = tensor.extract_slice %extracted_slice_3[0, %arg7] [8, 16] [1, 1] : tensor<8x128xf32> to tensor<8x16xf32> | |
%extracted_slice_8 = tensor.extract_slice %extracted_slice_4[%arg7, 0] [16, 32] [1, 1] : tensor<128x32xf32> to tensor<16x32xf32> | |
%18 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_7, %extracted_slice_8 : tensor<8x16xf32>, tensor<16x32xf32>) outs(%arg8 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
scf.yield %18 : tensor<8x32xf32> | |
} | |
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32> | |
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} { | |
^bb0(%in: f32, %in_7: f32, %out: f32): | |
%18 = arith.addf %in, %in_7 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<8x32xf32> | |
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32> | |
scf.yield %inserted_slice : tensor<16x64xf32> | |
} | |
scf.yield %13 : tensor<16x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
// -----// IR Dump After FuseTensorPadWithConsumerPass (iree-codegen-fuse-tensor-pad-with-consumer) //----- // | |
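// Note: unchanged; there are still no tensor.pad ops to fuse.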
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%10 = scf.for %arg0 = %c0 to %c128 step %c16 iter_args(%arg1 = %9) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[0, %arg0] [64, 16] [1, 1] : tensor<64x128xf32> to tensor<64x16xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[%arg0, 0] [16, 256] [1, 1] : tensor<128x256xf32> to tensor<16x256xf32> | |
%12 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x16xf32>, tensor<16x256xf32>) outs(%arg1 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
scf.yield %12 : tensor<64x256xf32> | |
} | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32> | |
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) { | |
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) { | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32> | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32> | |
%14 = tensor.empty() : tensor<8x32xf32> | |
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%16 = scf.for %arg7 = %c0 to %c128 step %c16 iter_args(%arg8 = %15) -> (tensor<8x32xf32>) { | |
%extracted_slice_7 = tensor.extract_slice %extracted_slice_3[0, %arg7] [8, 16] [1, 1] : tensor<8x128xf32> to tensor<8x16xf32> | |
%extracted_slice_8 = tensor.extract_slice %extracted_slice_4[%arg7, 0] [16, 32] [1, 1] : tensor<128x32xf32> to tensor<16x32xf32> | |
%18 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_7, %extracted_slice_8 : tensor<8x16xf32>, tensor<16x32xf32>) outs(%arg8 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
scf.yield %18 : tensor<8x32xf32> | |
} | |
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32> | |
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} { | |
^bb0(%in: f32, %in_7: f32, %out: f32): | |
%18 = arith.addf %in, %in_7 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<8x32xf32> | |
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32> | |
scf.yield %inserted_slice : tensor<16x64xf32> | |
} | |
scf.yield %13 : tensor<16x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
// -----// IR Dump After ConcretizePadResultShapePass (iree-codegen-concretize-pad-result-shape) //----- // | |
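// Note: as before, no tensor.pad ops are present, so this dump appears to match the previous one.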
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%10 = scf.for %arg0 = %c0 to %c128 step %c16 iter_args(%arg1 = %9) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[0, %arg0] [64, 16] [1, 1] : tensor<64x128xf32> to tensor<64x16xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[%arg0, 0] [16, 256] [1, 1] : tensor<128x256xf32> to tensor<16x256xf32> | |
%12 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x16xf32>, tensor<16x256xf32>) outs(%arg1 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
scf.yield %12 : tensor<64x256xf32> | |
} | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32> | |
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) { | |
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) { | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32> | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32> | |
%14 = tensor.empty() : tensor<8x32xf32> | |
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%16 = scf.for %arg7 = %c0 to %c128 step %c16 iter_args(%arg8 = %15) -> (tensor<8x32xf32>) { | |
%extracted_slice_7 = tensor.extract_slice %extracted_slice_3[0, %arg7] [8, 16] [1, 1] : tensor<8x128xf32> to tensor<8x16xf32> | |
%extracted_slice_8 = tensor.extract_slice %extracted_slice_4[%arg7, 0] [16, 32] [1, 1] : tensor<128x32xf32> to tensor<16x32xf32> | |
%18 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_7, %extracted_slice_8 : tensor<8x16xf32>, tensor<16x32xf32>) outs(%arg8 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
scf.yield %18 : tensor<8x32xf32> | |
} | |
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32> | |
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} { | |
^bb0(%in: f32, %in_7: f32, %out: f32): | |
%18 = arith.addf %in, %in_7 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<8x32xf32> | |
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32> | |
scf.yield %inserted_slice : tensor<16x64xf32> | |
} | |
scf.yield %13 : tensor<16x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
// -----// IR Dump After LLVMCPUPeelPass (iree-llvmcpu-peel) //----- // | |
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%10 = scf.for %arg0 = %c0 to %c128 step %c16 iter_args(%arg1 = %9) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[0, %arg0] [64, 16] [1, 1] : tensor<64x128xf32> to tensor<64x16xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[%arg0, 0] [16, 256] [1, 1] : tensor<128x256xf32> to tensor<16x256xf32> | |
%12 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x16xf32>, tensor<16x256xf32>) outs(%arg1 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
scf.yield %12 : tensor<64x256xf32> | |
} | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32> | |
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) { | |
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) { | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32> | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32> | |
%14 = tensor.empty() : tensor<8x32xf32> | |
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%16 = scf.for %arg7 = %c0 to %c128 step %c16 iter_args(%arg8 = %15) -> (tensor<8x32xf32>) { | |
%extracted_slice_7 = tensor.extract_slice %extracted_slice_3[0, %arg7] [8, 16] [1, 1] : tensor<8x128xf32> to tensor<8x16xf32> | |
%extracted_slice_8 = tensor.extract_slice %extracted_slice_4[%arg7, 0] [16, 32] [1, 1] : tensor<128x32xf32> to tensor<16x32xf32> | |
%18 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_7, %extracted_slice_8 : tensor<8x16xf32>, tensor<16x32xf32>) outs(%arg8 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
scf.yield %18 : tensor<8x32xf32> | |
} | |
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32> | |
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} { | |
^bb0(%in: f32, %in_7: f32, %out: f32): | |
%18 = arith.addf %in, %in_7 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<8x32xf32> | |
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32> | |
scf.yield %inserted_slice : tensor<16x64xf32> | |
} | |
scf.yield %13 : tensor<16x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
// -----// IR Dump After TensorToVectorVectorizePadPass (iree-codegen-vectorize-tensor-pad) //----- // | |
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%8 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
%10 = scf.for %arg0 = %c0 to %c128 step %c16 iter_args(%arg1 = %9) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[0, %arg0] [64, 16] [1, 1] : tensor<64x128xf32> to tensor<64x16xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[%arg0, 0] [16, 256] [1, 1] : tensor<128x256xf32> to tensor<16x256xf32> | |
%12 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x16xf32>, tensor<16x256xf32>) outs(%arg1 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
scf.yield %12 : tensor<64x256xf32> | |
} | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32> | |
%extracted_slice_0 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32> | |
%extracted_slice_1 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_2) -> (tensor<16x64xf32>) { | |
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) { | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32> | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice_0[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32> | |
%14 = tensor.empty() : tensor<8x32xf32> | |
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} ins(%cst : f32) outs(%14 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
%16 = scf.for %arg7 = %c0 to %c128 step %c16 iter_args(%arg8 = %15) -> (tensor<8x32xf32>) { | |
%extracted_slice_7 = tensor.extract_slice %extracted_slice_3[0, %arg7] [8, 16] [1, 1] : tensor<8x128xf32> to tensor<8x16xf32> | |
%extracted_slice_8 = tensor.extract_slice %extracted_slice_4[%arg7, 0] [16, 32] [1, 1] : tensor<128x32xf32> to tensor<16x32xf32> | |
%18 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64, 0], [16, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>} ins(%extracted_slice_7, %extracted_slice_8 : tensor<8x16xf32>, tensor<16x32xf32>) outs(%arg8 : tensor<8x32xf32>) -> tensor<8x32xf32> | |
scf.yield %18 : tensor<8x32xf32> | |
} | |
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32> | |
%extracted_slice_6 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %extracted_slice_5 : tensor<8x32xf32>, tensor<32xf32>) outs(%extracted_slice_6 : tensor<8x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 64], [8, 32], [0, 0], [0, 0]]>} { | |
^bb0(%in: f32, %in_7: f32, %out: f32): | |
%18 = arith.addf %in, %in_7 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<8x32xf32> | |
%inserted_slice = tensor.insert_slice %17 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32> | |
scf.yield %inserted_slice : tensor<16x64xf32> | |
} | |
scf.yield %13 : tensor<16x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
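The dump above is byte-for-byte identical to the ones after ConcretizePadResultShapePass and LLVMCPUPeelPass: none of these three passes had anything to rewrite, presumably because every tile size divides its loop range evenly, so there are no remainder iterations to peel and no pad ops to concretize or vectorize. A minimal sketch of that divisibility argument (Python; the loop extents and steps are copied from the scf.for/scf.forall ops above, the "no remainder means nothing to peel" reading is my assumption about why the peeling pass is a no-op):

loops = {
    "first matmul, K tiling (scf.for to %c128 step %c16)": (128, 16),
    "workgroup forall rows (to 64 step 16)": (64, 16),
    "workgroup forall cols (to 256 step 64)": (256, 64),
    "inner rows (scf.for to %c16 step %c8)": (16, 8),
    "inner cols (scf.for to %c64 step %c32)": (64, 32),
    "inner K tiling (scf.for to %c128 step %c16)": (128, 16),
}
for name, (extent, step) in loops.items():
    # A zero remainder means there is no partial tile for loop peeling to split off.
    print(f"{name}: remainder = {extent % step}")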
// -----// IR Dump After GenericVectorizationPass (iree-codegen-generic-vectorization) //----- // | |
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<8x32xf32> | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<64x256xf32> | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = vector.transfer_write %cst_0, %8[%c0, %c0] {in_bounds = [true, true]} : vector<64x256xf32>, tensor<64x256xf32> | |
%10 = scf.for %arg0 = %c0 to %c128 step %c16 iter_args(%arg1 = %9) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[0, %arg0] [64, 16] [1, 1] : tensor<64x128xf32> to tensor<64x16xf32> | |
%extracted_slice_2 = tensor.extract_slice %6[%arg0, 0] [16, 256] [1, 1] : tensor<128x256xf32> to tensor<16x256xf32> | |
%12 = vector.transfer_read %extracted_slice[%c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<64x16xf32>, vector<64x16xf32> | |
%13 = vector.transfer_read %extracted_slice_2[%c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<16x256xf32>, vector<16x256xf32> | |
%14 = vector.transfer_read %arg1[%c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<64x256xf32>, vector<64x256xf32> | |
%15 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %12, %13, %14 : vector<64x16xf32>, vector<16x256xf32> into vector<64x256xf32> | |
%16 = vector.transfer_write %15, %arg1[%c0, %c0] {in_bounds = [true, true]} : vector<64x256xf32>, tensor<64x256xf32> | |
scf.yield %16 : tensor<64x256xf32> | |
} | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, 0] [16, 128] [1, 1] : tensor<64x128xf32> to tensor<16x128xf32> | |
%extracted_slice_2 = tensor.extract_slice %6[0, %arg1] [128, 64] [1, 1] : tensor<128x256xf32> to tensor<128x64xf32> | |
%extracted_slice_3 = tensor.extract_slice %7[%arg1] [64] [1] : tensor<256xf32> to tensor<64xf32> | |
%extracted_slice_4 = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice_4) -> (tensor<16x64xf32>) { | |
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) { | |
%extracted_slice_5 = tensor.extract_slice %extracted_slice[%arg3, 0] [8, 128] [1, 1] : tensor<16x128xf32> to tensor<8x128xf32> | |
%extracted_slice_6 = tensor.extract_slice %extracted_slice_2[0, %arg5] [128, 32] [1, 1] : tensor<128x64xf32> to tensor<128x32xf32> | |
%14 = tensor.empty() : tensor<8x32xf32> | |
%15 = vector.transfer_write %cst, %14[%c0, %c0] {in_bounds = [true, true]} : vector<8x32xf32>, tensor<8x32xf32> | |
%16 = scf.for %arg7 = %c0 to %c128 step %c16 iter_args(%arg8 = %15) -> (tensor<8x32xf32>) { | |
%extracted_slice_9 = tensor.extract_slice %extracted_slice_5[0, %arg7] [8, 16] [1, 1] : tensor<8x128xf32> to tensor<8x16xf32> | |
%extracted_slice_10 = tensor.extract_slice %extracted_slice_6[%arg7, 0] [16, 32] [1, 1] : tensor<128x32xf32> to tensor<16x32xf32> | |
%22 = vector.transfer_read %extracted_slice_9[%c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<8x16xf32>, vector<8x16xf32> | |
%23 = vector.transfer_read %extracted_slice_10[%c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<16x32xf32>, vector<16x32xf32> | |
%24 = vector.transfer_read %arg8[%c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<8x32xf32>, vector<8x32xf32> | |
%25 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %22, %23, %24 : vector<8x16xf32>, vector<16x32xf32> into vector<8x32xf32> | |
%26 = vector.transfer_write %25, %arg8[%c0, %c0] {in_bounds = [true, true]} : vector<8x32xf32>, tensor<8x32xf32> | |
scf.yield %26 : tensor<8x32xf32> | |
} | |
%extracted_slice_7 = tensor.extract_slice %extracted_slice_3[%arg5] [32] [1] : tensor<64xf32> to tensor<32xf32> | |
%extracted_slice_8 = tensor.extract_slice %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<16x64xf32> to tensor<8x32xf32> | |
%17 = vector.transfer_read %16[%c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<8x32xf32>, vector<8x32xf32> | |
%18 = vector.transfer_read %extracted_slice_7[%c0], %cst_1 {in_bounds = [true]} : tensor<32xf32>, vector<32xf32> | |
%19 = vector.broadcast %18 : vector<32xf32> to vector<8x32xf32> | |
%20 = arith.addf %17, %19 : vector<8x32xf32> | |
%21 = vector.transfer_write %20, %extracted_slice_8[%c0, %c0] {in_bounds = [true, true]} : vector<8x32xf32>, tensor<8x32xf32> | |
%inserted_slice = tensor.insert_slice %21 into %arg6[%arg3, %arg5] [8, 32] [1, 1] : tensor<8x32xf32> into tensor<16x64xf32> | |
scf.yield %inserted_slice : tensor<16x64xf32> | |
} | |
scf.yield %13 : tensor<16x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
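This is the step where the two results diverge in vector granularity: the second result keeps accumulating in 8x32 tiles inside the scf.forall, while the first matmul, which was tiled only along its reduction dimension, is vectorized with full 64x16 and 16x256 operands and a 64x256 accumulator carried through the scf.for. A rough footprint comparison (Python sketch; the only assumption beyond the shapes printed above is the 4-byte f32 element size):

# Byte footprint of the vector values created by vectorization above (f32 = 4 bytes).
def vec_bytes(shape, elem_bytes=4):
    n = 1
    for d in shape:
        n *= d
    return n * elem_bytes

paths = {
    "first matmul (reduction-only tiling)": [(64, 16), (16, 256), (64, 256)],
    "second result (8x32 inner tiles)": [(8, 16), (16, 32), (8, 32)],
}
for label, shapes in paths.items():
    print(label, {f"{m}x{n}xf32": vec_bytes((m, n)) for (m, n) in shapes})
# The first path peaks at 65536 bytes for the 64x256 accumulator;
# the second path stays at or below 1024 bytes per value.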
// -----// IR Dump After OptimizeTensorInsertExtractSlicesPass (iree-codegen-optimize-tensor-insert-extract-slices) //----- // | |
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<8x32xf32> | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<64x256xf32> | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = scf.for %arg0 = %c0 to %c128 step %c16 iter_args(%arg1 = %cst_0) -> (vector<64x256xf32>) { | |
%12 = vector.transfer_read %5[%c0, %arg0], %cst_1 {in_bounds = [true, true]} : tensor<64x128xf32>, vector<64x16xf32> | |
%13 = vector.transfer_read %6[%arg0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<128x256xf32>, vector<16x256xf32> | |
%14 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %12, %13, %arg1 : vector<64x16xf32>, vector<16x256xf32> into vector<64x256xf32> | |
scf.yield %14 : vector<64x256xf32> | |
} | |
%10 = vector.transfer_write %9, %8[%c0, %c0] {in_bounds = [true, true]} : vector<64x256xf32>, tensor<64x256xf32> | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice) -> (tensor<16x64xf32>) { | |
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) { | |
%14 = scf.for %arg7 = %c0 to %c128 step %c16 iter_args(%arg8 = %cst) -> (vector<8x32xf32>) { | |
%20 = arith.addi %arg3, %arg0 : index | |
%21 = vector.transfer_read %5[%20, %arg7], %cst_1 {in_bounds = [true, true]} : tensor<64x128xf32>, vector<8x16xf32> | |
%22 = arith.addi %arg5, %arg1 : index | |
%23 = vector.transfer_read %6[%arg7, %22], %cst_1 {in_bounds = [true, true]} : tensor<128x256xf32>, vector<16x32xf32> | |
%24 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %21, %23, %arg8 : vector<8x16xf32>, vector<16x32xf32> into vector<8x32xf32> | |
scf.yield %24 : vector<8x32xf32> | |
} | |
%15 = arith.addi %arg5, %arg1 : index | |
%16 = vector.transfer_read %7[%15], %cst_1 {in_bounds = [true]} : tensor<256xf32>, vector<32xf32> | |
%17 = vector.broadcast %16 : vector<32xf32> to vector<8x32xf32> | |
%18 = arith.addf %14, %17 : vector<8x32xf32> | |
%19 = vector.transfer_write %18, %arg6[%arg3, %arg5] {in_bounds = [true, true]} : vector<8x32xf32>, tensor<16x64xf32> | |
scf.yield %19 : tensor<16x64xf32> | |
} | |
scf.yield %13 : tensor<16x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<8x32xf32> | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<64x256xf32> | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = scf.for %arg0 = %c0 to %c128 step %c16 iter_args(%arg1 = %cst_0) -> (vector<64x256xf32>) { | |
%12 = vector.transfer_read %5[%c0, %arg0], %cst_1 {in_bounds = [true, true]} : tensor<64x128xf32>, vector<64x16xf32> | |
%13 = vector.transfer_read %6[%arg0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<128x256xf32>, vector<16x256xf32> | |
%14 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %12, %13, %arg1 : vector<64x16xf32>, vector<16x256xf32> into vector<64x256xf32> | |
scf.yield %14 : vector<64x256xf32> | |
} | |
%10 = vector.transfer_write %9, %8[%c0, %c0] {in_bounds = [true, true]} : vector<64x256xf32>, tensor<64x256xf32> | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice) -> (tensor<16x64xf32>) { | |
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) { | |
%14 = scf.for %arg7 = %c0 to %c128 step %c16 iter_args(%arg8 = %cst) -> (vector<8x32xf32>) { | |
%20 = arith.addi %arg3, %arg0 : index | |
%21 = vector.transfer_read %5[%20, %arg7], %cst_1 {in_bounds = [true, true]} : tensor<64x128xf32>, vector<8x16xf32> | |
%22 = arith.addi %arg5, %arg1 : index | |
%23 = vector.transfer_read %6[%arg7, %22], %cst_1 {in_bounds = [true, true]} : tensor<128x256xf32>, vector<16x32xf32> | |
%24 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %21, %23, %arg8 : vector<8x16xf32>, vector<16x32xf32> into vector<8x32xf32> | |
scf.yield %24 : vector<8x32xf32> | |
} | |
%15 = arith.addi %arg5, %arg1 : index | |
%16 = vector.transfer_read %7[%15], %cst_1 {in_bounds = [true]} : tensor<256xf32>, vector<32xf32> | |
%17 = vector.broadcast %16 : vector<32xf32> to vector<8x32xf32> | |
%18 = arith.addf %14, %17 : vector<8x32xf32> | |
%19 = vector.transfer_write %18, %arg6[%arg3, %arg5] {in_bounds = [true, true]} : vector<8x32xf32>, tensor<16x64xf32> | |
scf.yield %19 : tensor<16x64xf32> | |
} | |
scf.yield %13 : tensor<16x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<8x32xf32> | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<64x256xf32> | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = scf.for %arg0 = %c0 to %c128 step %c16 iter_args(%arg1 = %cst_0) -> (vector<64x256xf32>) { | |
%12 = vector.transfer_read %5[%c0, %arg0], %cst_1 {in_bounds = [true, true]} : tensor<64x128xf32>, vector<64x16xf32> | |
%13 = vector.transfer_read %6[%arg0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<128x256xf32>, vector<16x256xf32> | |
%14 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %12, %13, %arg1 : vector<64x16xf32>, vector<16x256xf32> into vector<64x256xf32> | |
scf.yield %14 : vector<64x256xf32> | |
} | |
%10 = vector.transfer_write %9, %8[%c0, %c0] {in_bounds = [true, true]} : vector<64x256xf32>, tensor<64x256xf32> | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) { | |
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32> | |
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice) -> (tensor<16x64xf32>) { | |
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) { | |
%14 = scf.for %arg7 = %c0 to %c128 step %c16 iter_args(%arg8 = %cst) -> (vector<8x32xf32>) { | |
%20 = arith.addi %arg3, %arg0 : index | |
%21 = vector.transfer_read %5[%20, %arg7], %cst_1 {in_bounds = [true, true]} : tensor<64x128xf32>, vector<8x16xf32> | |
%22 = arith.addi %arg5, %arg1 : index | |
%23 = vector.transfer_read %6[%arg7, %22], %cst_1 {in_bounds = [true, true]} : tensor<128x256xf32>, vector<16x32xf32> | |
%24 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %21, %23, %arg8 : vector<8x16xf32>, vector<16x32xf32> into vector<8x32xf32> | |
scf.yield %24 : vector<8x32xf32> | |
} | |
%15 = arith.addi %arg5, %arg1 : index | |
%16 = vector.transfer_read %7[%15], %cst_1 {in_bounds = [true]} : tensor<256xf32>, vector<32xf32> | |
%17 = vector.broadcast %16 : vector<32xf32> to vector<8x32xf32> | |
%18 = arith.addf %14, %17 : vector<8x32xf32> | |
%19 = vector.transfer_write %18, %arg6[%arg3, %arg5] {in_bounds = [true, true]} : vector<8x32xf32>, tensor<16x64xf32> | |
scf.yield %19 : tensor<16x64xf32> | |
} | |
scf.yield %13 : tensor<16x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
return | |
} | |
test.mlir:12:1: error: One or more operations with large vector sizes (32768 bytes) were found: | |
func.func @multi_result() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { | |
^ | |
<unknown>:0: note: %cst_0 = arith.constant dense<0.000000e+00> : vector<64x256xf32> | |
<unknown>:0: note: %14 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %12, %13, %arg1 : vector<64x16xf32>, vector<16x256xf32> into vector<64x256xf32> | |
test.mlir:27:9: note: scf.yield %14 : vector<64x256xf32> | |
%11 = linalg.matmul ins(%6, %7 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%10 : tensor<64x256xf32>) -> tensor<64x256xf32> | |
^ | |
test.mlir:27:9: note: | |
%9 = scf.for %arg0 = %c0 to %c128 step %c16 iter_args(%arg1 = %cst_0) -> (vector<64x256xf32>) { | |
%12 = vector.transfer_read %5[%c0, %arg0], %cst_1 {in_bounds = [true, true]} : tensor<64x128xf32>, vector<64x16xf32> | |
%13 = vector.transfer_read %6[%arg0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<128x256xf32>, vector<16x256xf32> | |
%14 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %12, %13, %arg1 : vector<64x16xf32>, vector<16x256xf32> into vector<64x256xf32> | |
scf.yield %14 : vector<64x256xf32> | |
} | |
<unknown>:0: note: %10 = vector.transfer_write %9, %8[%c0, %c0] {in_bounds = [true, true]} : vector<64x256xf32>, tensor<64x256xf32> | |
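The diagnostic above is the direct consequence of that 64x256 accumulator: a vector<64x256xf32> occupies 64 * 256 * 4 = 65536 bytes, well past the 32768-byte figure the verifier reports (which reads as the configured upper bound rather than the size of the offending value, though that interpretation is mine). A hedged restatement of the check implied by the message (Python sketch, not IREE's actual implementation):

# Hypothetical restatement of the size check implied by the diagnostic;
# 32768 is the figure printed above and is assumed to be the allowed maximum.
LIMIT_BYTES = 32768

def f32_vector_bytes(shape):
    n = 1
    for d in shape:
        n *= d
    return 4 * n

flagged = (64, 256)  # shape of %cst_0, the vector.contract result, and the final transfer_write
size = f32_vector_bytes(flagged)
print(f"vector<64x256xf32>: {size} bytes vs limit {LIMIT_BYTES} ->",
      "illegal" if size > LIMIT_BYTES else "ok")
# 65536 > 32768, while the 8x32 values of the other result stay far under the figure;
# hence the next two dumps are marked Failed for the verifier and for
# iree-llvmcpu-lower-executable-target.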
// -----// IR Dump After LLVMCPUVerifyVectorSizeLegalityPass Failed (iree-llvmcpu-verify-vector-size-legality) //----- // | |
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<8x32xf32> | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<64x256xf32> | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32> | |
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32> | |
%8 = tensor.empty() : tensor<64x256xf32> | |
%9 = scf.for %arg0 = %c0 to %c128 step %c16 iter_args(%arg1 = %cst_0) -> (vector<64x256xf32>) { | |
%12 = vector.transfer_read %5[%c0, %arg0], %cst_1 {in_bounds = [true, true]} : tensor<64x128xf32>, vector<64x16xf32> | |
%13 = vector.transfer_read %6[%arg0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<128x256xf32>, vector<16x256xf32> | |
%14 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %12, %13, %arg1 : vector<64x16xf32>, vector<16x256xf32> into vector<64x256xf32> | |
scf.yield %14 : vector<64x256xf32> | |
} | |
%10 = vector.transfer_write %9, %8[%c0, %c0] {in_bounds = [true, true]} : vector<64x256xf32>, tensor<64x256xf32> | |
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32>
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice) -> (tensor<16x64xf32>) {
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) {
%14 = scf.for %arg7 = %c0 to %c128 step %c16 iter_args(%arg8 = %cst) -> (vector<8x32xf32>) {
%20 = arith.addi %arg3, %arg0 : index
%21 = vector.transfer_read %5[%20, %arg7], %cst_1 {in_bounds = [true, true]} : tensor<64x128xf32>, vector<8x16xf32>
%22 = arith.addi %arg5, %arg1 : index
%23 = vector.transfer_read %6[%arg7, %22], %cst_1 {in_bounds = [true, true]} : tensor<128x256xf32>, vector<16x32xf32>
%24 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %21, %23, %arg8 : vector<8x16xf32>, vector<16x32xf32> into vector<8x32xf32>
scf.yield %24 : vector<8x32xf32>
}
%15 = arith.addi %arg5, %arg1 : index
%16 = vector.transfer_read %7[%15], %cst_1 {in_bounds = [true]} : tensor<256xf32>, vector<32xf32>
%17 = vector.broadcast %16 : vector<32xf32> to vector<8x32xf32>
%18 = arith.addf %14, %17 : vector<8x32xf32>
%19 = vector.transfer_write %18, %arg6[%arg3, %arg5] {in_bounds = [true, true]} : vector<8x32xf32>, tensor<16x64xf32>
scf.yield %19 : tensor<16x64xf32>
}
scf.yield %13 : tensor<16x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
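// Annotation: both full 64x256 results are then written back to separate output
// bindings: %10 (matmul only) to binding 4 and %11 (matmul + bias) to binding 5.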
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
return
}
// -----// IR Dump After LLVMCPULowerExecutableTargetPass Failed (iree-llvmcpu-lower-executable-target) //----- //
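// Annotation: the function below appears to be the same @multi_result body,
// re-printed at the point where iree-llvmcpu-lower-executable-target reported
// failure; the attribute aliases (pipeline layout, executable target,
// translation info) are now expanded inline on the op.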
func.func @multi_result() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>, translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>} {
%cst = arith.constant dense<0.000000e+00> : vector<8x32xf32>
%cst_0 = arith.constant dense<0.000000e+00> : vector<64x256xf32>
%c128 = arith.constant 128 : index
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%cst_1 = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32>
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
%7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32>
%8 = tensor.empty() : tensor<64x256xf32>
%9 = scf.for %arg0 = %c0 to %c128 step %c16 iter_args(%arg1 = %cst_0) -> (vector<64x256xf32>) {
%12 = vector.transfer_read %5[%c0, %arg0], %cst_1 {in_bounds = [true, true]} : tensor<64x128xf32>, vector<64x16xf32>
%13 = vector.transfer_read %6[%arg0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<128x256xf32>, vector<16x256xf32>
%14 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %12, %13, %arg1 : vector<64x16xf32>, vector<16x256xf32> into vector<64x256xf32>
scf.yield %14 : vector<64x256xf32>
}
%10 = vector.transfer_write %9, %8[%c0, %c0] {in_bounds = [true, true]} : vector<64x256xf32>, tensor<64x256xf32>
%11 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 256) step (16, 64) shared_outs(%arg2 = %8) -> (tensor<64x256xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<64x256xf32> to tensor<16x64xf32>
%12 = scf.for %arg3 = %c0 to %c16 step %c8 iter_args(%arg4 = %extracted_slice) -> (tensor<16x64xf32>) {
%13 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<16x64xf32>) {
%14 = scf.for %arg7 = %c0 to %c128 step %c16 iter_args(%arg8 = %cst) -> (vector<8x32xf32>) {
%20 = arith.addi %arg3, %arg0 : index
%21 = vector.transfer_read %5[%20, %arg7], %cst_1 {in_bounds = [true, true]} : tensor<64x128xf32>, vector<8x16xf32>
%22 = arith.addi %arg5, %arg1 : index
%23 = vector.transfer_read %6[%arg7, %22], %cst_1 {in_bounds = [true, true]} : tensor<128x256xf32>, vector<16x32xf32>
%24 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %21, %23, %arg8 : vector<8x16xf32>, vector<16x32xf32> into vector<8x32xf32>
scf.yield %24 : vector<8x32xf32>
}
%15 = arith.addi %arg5, %arg1 : index
%16 = vector.transfer_read %7[%15], %cst_1 {in_bounds = [true]} : tensor<256xf32>, vector<32xf32>
%17 = vector.broadcast %16 : vector<32xf32> to vector<8x32xf32>
%18 = arith.addf %14, %17 : vector<8x32xf32>
%19 = vector.transfer_write %18, %arg6[%arg3, %arg5] {in_bounds = [true, true]} : vector<8x32xf32>, tensor<16x64xf32>
scf.yield %19 : tensor<16x64xf32>
}
scf.yield %13 : tensor<16x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [16, 64] [1, 1] : tensor<16x64xf32> into tensor<64x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
return
}