Created
September 25, 2024 15:19
-
-
Save pashu123/79ea5042c9415fdc0184bfd4ad0d76c9 to your computer and use it in GitHub Desktop.
This file has been truncated, but you can view the full file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// -----// IR Dump After AutoInputConversionPipelinePass (iree-auto-input-conversion) //----- // | |
module { | |
func.func @fully_dynamic_pack_simple() { | |
%0 = flow.tensor.dynamic_constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> -> tensor<?x?xi32> | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%1 = util.unfoldable_constant 2 : index | |
%dim = tensor.dim %0, %c0 : tensor<?x?xi32> | |
%dim_0 = tensor.dim %0, %c1 : tensor<?x?xi32> | |
%2 = arith.ceildivui %dim, %1 : index | |
%3 = arith.ceildivui %dim_0, %1 : index | |
%4 = tensor.empty(%2, %3, %1, %1) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %0 inner_dims_pos = [0, 1] inner_tiles = [%1, %1] into %4 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%cast = tensor.cast %pack : tensor<?x?x?x?xi32> to tensor<2x2x2x2xi32> | |
check.expect_eq_const(%cast, dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>) : tensor<2x2x2x2xi32> | |
return | |
} | |
} | |
// -----// IR Dump After IREEImportPublicPass (iree-import-public) //----- // | |
module { | |
util.func public @fully_dynamic_pack_simple() { | |
%0 = flow.tensor.dynamic_constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> -> tensor<?x?xi32> | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%1 = util.unfoldable_constant 2 : index | |
%dim = tensor.dim %0, %c0 : tensor<?x?xi32> | |
%dim_0 = tensor.dim %0, %c1 : tensor<?x?xi32> | |
%2 = arith.ceildivui %dim, %1 : index | |
%3 = arith.ceildivui %dim_0, %1 : index | |
%4 = tensor.empty(%2, %3, %1, %1) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %0 inner_dims_pos = [0, 1] inner_tiles = [%1, %1] into %4 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%cast = tensor.cast %pack : tensor<?x?x?x?xi32> to tensor<2x2x2x2xi32> | |
check.expect_eq_const(%cast, dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After ImportMLProgramPass (iree-import-ml-program) //----- // | |
module { | |
util.func public @fully_dynamic_pack_simple() { | |
%0 = flow.tensor.dynamic_constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> -> tensor<?x?xi32> | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%1 = util.unfoldable_constant 2 : index | |
%dim = tensor.dim %0, %c0 : tensor<?x?xi32> | |
%dim_0 = tensor.dim %0, %c1 : tensor<?x?xi32> | |
%2 = arith.ceildivui %dim, %1 : index | |
%3 = arith.ceildivui %dim_0, %1 : index | |
%4 = tensor.empty(%2, %3, %1, %1) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %0 inner_dims_pos = [0, 1] inner_tiles = [%1, %1] into %4 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%cast = tensor.cast %pack : tensor<?x?x?x?xi32> to tensor<2x2x2x2xi32> | |
check.expect_eq_const(%cast, dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After SanitizeModuleNamesPass (iree-sanitize-module-names) //----- // | |
module { | |
util.func public @fully_dynamic_pack_simple() { | |
%0 = flow.tensor.dynamic_constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> -> tensor<?x?xi32> | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%1 = util.unfoldable_constant 2 : index | |
%dim = tensor.dim %0, %c0 : tensor<?x?xi32> | |
%dim_0 = tensor.dim %0, %c1 : tensor<?x?xi32> | |
%2 = arith.ceildivui %dim, %1 : index | |
%3 = arith.ceildivui %dim_0, %1 : index | |
%4 = tensor.empty(%2, %3, %1, %1) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %0 inner_dims_pos = [0, 1] inner_tiles = [%1, %1] into %4 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%cast = tensor.cast %pack : tensor<?x?x?x?xi32> to tensor<2x2x2x2xi32> | |
check.expect_eq_const(%cast, dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After ConvertMeshToFlowPass (iree-convert-mesh-to-flow) //----- // | |
module { | |
util.func public @fully_dynamic_pack_simple() { | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = flow.tensor.dynamic_constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> -> tensor<?x?xi32> | |
%1 = util.unfoldable_constant 2 : index | |
%dim = tensor.dim %0, %c0 : tensor<?x?xi32> | |
%dim_0 = tensor.dim %0, %c1 : tensor<?x?xi32> | |
%2 = arith.ceildivui %dim, %1 : index | |
%3 = arith.ceildivui %dim_0, %1 : index | |
%4 = tensor.empty(%2, %3, %1, %1) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %0 inner_dims_pos = [0, 1] inner_tiles = [%1, %1] into %4 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%cast = tensor.cast %pack : tensor<?x?x?x?xi32> to tensor<2x2x2x2xi32> | |
check.expect_eq_const(%cast, dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After DemoteF64ToF32Pass (iree-input-conversion-demote-f64-to-f32) //----- // | |
module { | |
util.func public @fully_dynamic_pack_simple() { | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = flow.tensor.dynamic_constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> -> tensor<?x?xi32> | |
%1 = util.unfoldable_constant 2 : index | |
%dim = tensor.dim %0, %c0 : tensor<?x?xi32> | |
%dim_0 = tensor.dim %0, %c1 : tensor<?x?xi32> | |
%2 = arith.ceildivui %dim, %1 : index | |
%3 = arith.ceildivui %dim_0, %1 : index | |
%4 = tensor.empty(%2, %3, %1, %1) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %0 inner_dims_pos = [0, 1] inner_tiles = [%1, %1] into %4 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%cast = tensor.cast %pack : tensor<?x?x?x?xi32> to tensor<2x2x2x2xi32> | |
check.expect_eq_const(%cast, dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::ConvertStreamableOpsPass (iree-abi-convert-streamable-ops) //----- // | |
module { | |
util.func public @fully_dynamic_pack_simple() { | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = flow.tensor.dynamic_constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> -> tensor<?x?xi32> | |
%1 = util.unfoldable_constant 2 : index | |
%dim = tensor.dim %0, %c0 : tensor<?x?xi32> | |
%dim_0 = tensor.dim %0, %c1 : tensor<?x?xi32> | |
%2 = arith.ceildivui %dim, %1 : index | |
%3 = arith.ceildivui %dim_0, %1 : index | |
%4 = tensor.empty(%2, %3, %1, %1) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %0 inner_dims_pos = [0, 1] inner_tiles = [%1, %1] into %4 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%cast = tensor.cast %pack : tensor<?x?x?x?xi32> to tensor<2x2x2x2xi32> | |
check.expect_eq_const(%cast, dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass (iree-abi-wrap-entry-points) //----- // | |
module { | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = flow.tensor.dynamic_constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> -> tensor<?x?xi32> | |
%1 = util.unfoldable_constant 2 : index | |
%dim = tensor.dim %0, %c0 : tensor<?x?xi32> | |
%dim_0 = tensor.dim %0, %c1 : tensor<?x?xi32> | |
%2 = arith.ceildivui %dim, %1 : index | |
%3 = arith.ceildivui %dim_0, %1 : index | |
%4 = tensor.empty(%2, %3, %1, %1) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %0 inner_dims_pos = [0, 1] inner_tiles = [%1, %1] into %4 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%cast = tensor.cast %pack : tensor<?x?x?x?xi32> to tensor<2x2x2x2xi32> | |
check.expect_eq_const(%cast, dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%cast = tensor.cast %pack : tensor<?x?x?x?xi32> to tensor<2x2x2x2xi32> | |
check.expect_eq(%cast, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After Inliner (inline) //----- // | |
module { | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%cast = tensor.cast %pack : tensor<?x?x?x?xi32> to tensor<2x2x2x2xi32> | |
check.expect_eq(%cast, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%cast = tensor.cast %pack : tensor<?x?x?x?xi32> to tensor<2x2x2x2xi32> | |
check.expect_eq(%cast, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%cast = tensor.cast %pack : tensor<?x?x?x?xi32> to tensor<2x2x2x2xi32> | |
check.expect_eq(%cast, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After SymbolDCE (symbol-dce) //----- // | |
module { | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%cast = tensor.cast %pack : tensor<?x?x?x?xi32> to tensor<2x2x2x2xi32> | |
check.expect_eq(%cast, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After AssignLegacyTargetDevicesPass (iree-hal-assign-legacy-target-devices) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {hal.device.targets = [#device_target_local]} { | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%cast = tensor.cast %pack : tensor<?x?x?x?xi32> to tensor<2x2x2x2xi32> | |
check.expect_eq(%cast, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%cast = tensor.cast %pack : tensor<?x?x?x?xi32> to tensor<2x2x2x2xi32> | |
check.expect_eq(%cast, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%cast = tensor.cast %pack : tensor<?x?x?x?xi32> to tensor<2x2x2x2xi32> | |
check.expect_eq(%cast, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%cast = tensor.cast %pack : tensor<?x?x?x?xi32> to tensor<2x2x2x2xi32> | |
check.expect_eq(%cast, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%cast = tensor.cast %pack : tensor<?x?x?x?xi32> to tensor<2x2x2x2xi32> | |
check.expect_eq(%cast, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After LinalgQuantizedConvToConvPass (iree-global-opt-quantized-conv-to-conv) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After LinalgQuantizedConvToConvPass (iree-global-opt-quantized-conv-to-conv) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%cast = tensor.cast %pack : tensor<?x?x?x?xi32> to tensor<2x2x2x2xi32> | |
check.expect_eq(%cast, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After LinalgQuantizedMatmulToMatmulPass (iree-global-opt-quantized-matmul-to-matmul) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After LinalgQuantizedMatmulToMatmulPass (iree-global-opt-quantized-matmul-to-matmul) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%cast = tensor.cast %pack : tensor<?x?x?x?xi32> to tensor<2x2x2x2xi32> | |
check.expect_eq(%cast, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After RemoveZeroExtentTensorsPass (iree-global-opt-remove-zero-extent-tensors) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After DetachElementwiseFromNamedOpsPass (iree-global-opt-detach-elementwise-from-named-ops) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After LinalgNamedOpConversionPass (linalg-named-op-conversion) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After Convert1X1FilterConv2DToMatmulPass (iree-global-opt-convert-1x1-filter-conv2d-to-matmul) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After RemoveZeroExtentTensorsPass (iree-global-opt-remove-zero-extent-tensors) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After DetachElementwiseFromNamedOpsPass (iree-global-opt-detach-elementwise-from-named-ops) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After LinalgNamedOpConversionPass (linalg-named-op-conversion) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After Convert1X1FilterConv2DToMatmulPass (iree-global-opt-convert-1x1-filter-conv2d-to-matmul) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After EraseUnusedLinalgOperandsPass (iree-global-opt-erase-unused-linalg-operands) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After ExpandTensorShapesPass (iree-global-opt-expand-tensor-shapes) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After ConvertElementwiseToLinalgPass (convert-elementwise-to-linalg) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After ConvertElementwiseToLinalgPass (convert-elementwise-to-linalg) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After DecomposeConcatPass (iree-global-opt-decompose-concat) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After DecomposeConcatPass (iree-global-opt-decompose-concat) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After FoldUnitExtentDimsPass (iree-dispatch-creation-fold-unit-extent-dims) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After DemoteContractionInputsToBF16Pass (iree-global-opt-demote-contraction-inputs-to-bf16) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After DemoteContractionInputsToBF16Pass (iree-global-opt-demote-contraction-inputs-to-bf16) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After SetEncodingPass (iree-dispatch-creation-set-encoding) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After SetEncodingPass (iree-dispatch-creation-set-encoding) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After CPUMaterializeHostEncodingPass (iree-codegen-cpu-materialize-host-encoding) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After MaterializeHomogeneousEncodingsPass (iree-global-opt-materialize-homogeneous-encodings) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After SimplifyPackUnpackPass (iree-global-opt-simplify-pack-unpack) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After DataLayoutPropagationPass (iree-global-opt-data-layout-propagation) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After GlobalLoopInvariantCodeMotionPass (iree-global-opt-loop-invariant-code-motion) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After DataLayoutPropagationPass (iree-global-opt-data-layout-propagation) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After GlobalLoopInvariantCodeMotionPass (iree-global-opt-loop-invariant-code-motion) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After IPO (iree-util-ipo) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After HoistIntoGlobals (iree-util-hoist-into-globals) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After JitGlobalsPass (iree-consteval-jit-globals) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After TensorPadToTensorInsertSlicePass (iree-dispatch-creation-tensor-pad-to-tensor-insert-slice) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After IPO (iree-util-ipo) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FixedPointIterator (iree-util-fixed-point-iterator) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FusionPreprocessingPass (iree-dispatch-creation-fusion-preprocessing) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After FusionPreprocessingPass (iree-dispatch-creation-fusion-preprocessing) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After BubbleUpExpandShapesPass (iree-dispatch-creation-bubble-up-expand-shapes) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After BubbleUpExtractSlicesPass (iree-dispatch-creation-bubble-up-extract-slices) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After BubbleUpExpandShapesPass (iree-dispatch-creation-bubble-up-expand-shapes) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After BubbleUpExtractSlicesPass (iree-dispatch-creation-bubble-up-extract-slices) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After SinkReshapesPass (iree-dispatch-creation-sink-reshapes) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After FuseMultiUseElementwiseProducerPass (iree-dispatch-creation-fuse-multi-use-elementwise-producer) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After SplitReductionPass (iree-dispatch-creation-split-reduction-ops) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After TransposeGenericOpsPass (iree-dispatch-creation-transpose-generic-ops) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After FormScalarDispatchesPass (iree-dispatch-creation-form-scalar-dispatches) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After FormDispatchRegionsPass (iree-dispatch-creation-form-dispatch-regions) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CloneProducersIntoDispatchRegionsPass (iree-dispatch-creation-clone-producers-into-dispatch-regions) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CollapseDimensionsPass (iree-dispatch-creation-collapse-dimensions) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After ConvertDispatchRegionsToWorkgroupsPass (iree-dispatch-creation-convert-dispatch-regions-to-workgroups) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After SinkReshapesPass (iree-dispatch-creation-sink-reshapes) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After ConvertTensorToFlowPass (iree-dispatch-creation-convert-tensor-to-flow) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After FuseMultiUseElementwiseProducerPass (iree-dispatch-creation-fuse-multi-use-elementwise-producer) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After SplitReductionPass (iree-dispatch-creation-split-reduction-ops) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After TransposeGenericOpsPass (iree-dispatch-creation-transpose-generic-ops) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After FormScalarDispatchesPass (iree-dispatch-creation-form-scalar-dispatches) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
%8 = flow.tensor.reshape %pack : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After MaterializeDefaultWorkgroupCountRegionPass (iree-dispatch-creation-materialize-default-workgroup-count-region) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After FormDispatchRegionsPass (iree-dispatch-creation-form-dispatch-regions) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c2_0 = arith.constant 2 : index | |
%c3 = arith.constant 3 : index | |
%8 = flow.dispatch.region -> (tensor<?x?x?x?xi32>{%5, %6, %4, %4}) { | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %7 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.return %pack : tensor<?x?x?x?xi32> | |
} | |
%9 = flow.tensor.reshape %8 : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%9, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After CloneProducersIntoDispatchRegionsPass (iree-dispatch-creation-clone-producers-into-dispatch-regions) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c2_0 = arith.constant 2 : index | |
%c3 = arith.constant 3 : index | |
%8 = flow.dispatch.region -> (tensor<?x?x?x?xi32>{%5, %6, %4, %4}) { | |
%10 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %10 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.return %pack : tensor<?x?x?x?xi32> | |
} | |
%9 = flow.tensor.reshape %8 : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%9, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After CollapseDimensionsPass (iree-dispatch-creation-collapse-dimensions) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c2_0 = arith.constant 2 : index | |
%c3 = arith.constant 3 : index | |
%8 = flow.dispatch.region -> (tensor<?x?x?x?xi32>{%5, %6, %4, %4}) { | |
%10 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %3 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %10 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.return %pack : tensor<?x?x?x?xi32> | |
} | |
%9 = flow.tensor.reshape %8 : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%9, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After ConvertDispatchRegionsToWorkgroupsPass (iree-dispatch-creation-convert-dispatch-regions-to-workgroups) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = tensor.empty(%5, %6, %4, %4) : tensor<?x?x?x?xi32> | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c2_0 = arith.constant 2 : index | |
%c3 = arith.constant 3 : index | |
%8 = flow.dispatch.workgroups(%5, %6, %4, %3, %1, %2, %5, %6, %4, %4) : (index, index, index, tensor<?x?xi32>{%1, %2}, index, index, index, index, index, index) -> tensor<?x?x?x?xi32>{%5, %6, %4, %4} = | |
(%arg0: index, %arg1: index, %arg2: index, %arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>) { | |
%10 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [%arg4, %arg5], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%arg4, %arg5} -> tensor<?x?xi32> | |
%11 = tensor.empty(%arg6, %arg7, %arg9, %arg9) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %10 inner_dims_pos = [0, 1] inner_tiles = [%arg9, %arg9] into %11 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %arg10, offsets = [0, 0, 0, 0], sizes = [%arg6, %arg7, %arg9, %arg9], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%arg6, %arg7, %arg9, %arg9} | |
flow.return | |
} | |
%9 = flow.tensor.reshape %8 : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%9, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After ConvertTensorToFlowPass (iree-dispatch-creation-convert-tensor-to-flow) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = flow.dispatch.workgroups(%5, %6, %4, %3, %1, %2, %5, %6, %4, %4) : (index, index, index, tensor<?x?xi32>{%1, %2}, index, index, index, index, index, index) -> tensor<?x?x?x?xi32>{%5, %6, %4, %4} = | |
(%arg0: index, %arg1: index, %arg2: index, %arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>) { | |
%9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [%arg4, %arg5], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%arg4, %arg5} -> tensor<?x?xi32> | |
%10 = tensor.empty(%arg6, %arg7, %arg9, %arg9) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %9 inner_dims_pos = [0, 1] inner_tiles = [%arg9, %arg9] into %10 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %arg10, offsets = [0, 0, 0, 0], sizes = [%arg6, %arg7, %arg9, %arg9], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%arg6, %arg7, %arg9, %arg9} | |
flow.return | |
} | |
%8 = flow.tensor.reshape %7 : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = flow.dispatch.workgroups(%3, %1, %2, %5, %6, %4) : (tensor<?x?xi32>{%1, %2}, index, index, index, index, index) -> tensor<?x?x?x?xi32>{%5, %6, %4, %4} = | |
(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>) { | |
%9 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [%arg1, %arg2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%arg1, %arg2} -> tensor<?x?xi32> | |
%10 = tensor.empty(%arg3, %arg4, %arg5, %arg5) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %9 inner_dims_pos = [0, 1] inner_tiles = [%arg5, %arg5] into %10 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %arg6, offsets = [0, 0, 0, 0], sizes = [%arg3, %arg4, %arg5, %arg5], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%arg3, %arg4, %arg5, %arg5} | |
flow.return | |
} | |
%8 = flow.tensor.reshape %7 : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After MaterializeDefaultWorkgroupCountRegionPass (iree-dispatch-creation-materialize-default-workgroup-count-region) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = flow.dispatch.workgroups[%1, %2, %5, %6, %4](%3, %1, %2, %5, %6, %4) : (tensor<?x?xi32>{%1, %2}, index, index, index, index, index) -> tensor<?x?x?x?xi32>{%5, %6, %4, %4} = | |
(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>) { | |
%9 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%10 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%11 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%12 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%13 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%14 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [%9, %10], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%9, %10} -> tensor<?x?xi32> | |
%15 = tensor.empty(%11, %12, %13, %13) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %14 inner_dims_pos = [0, 1] inner_tiles = [%13, %13] into %15 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %arg6, offsets = [0, 0, 0, 0], sizes = [%11, %12, %13, %13], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%11, %12, %13, %13} | |
flow.return | |
} count(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%8 = flow.tensor.reshape %7 : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After VerifyInputLegalityPass (iree-verify-input-legality) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = flow.dispatch.workgroups[%1, %2, %5, %6, %4](%3, %1, %2, %5, %6, %4) : (tensor<?x?xi32>{%1, %2}, index, index, index, index, index) -> tensor<?x?x?x?xi32>{%5, %6, %4, %4} = | |
(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>) { | |
%9 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%10 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%11 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%12 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%13 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%14 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [%9, %10], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%9, %10} -> tensor<?x?xi32> | |
%15 = tensor.empty(%11, %12, %13, %13) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %14 inner_dims_pos = [0, 1] inner_tiles = [%13, %13] into %15 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %arg6, offsets = [0, 0, 0, 0], sizes = [%11, %12, %13, %13], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%11, %12, %13, %13} | |
flow.return | |
} count(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%8 = flow.tensor.reshape %7 : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After CaptureDynamicDimsPass (iree-flow-capture-dynamic-dims) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CaptureDynamicDimsPass (iree-flow-capture-dynamic-dims) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = flow.dispatch.workgroups[%1, %2, %5, %6, %4](%3, %1, %2, %5, %6, %4) : (tensor<?x?xi32>{%1, %2}, index, index, index, index, index) -> tensor<?x?x?x?xi32>{%5, %6, %4, %4} = | |
(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>) { | |
%9 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%arg1, %arg2} | |
%10 = flow.dispatch.tie_shape %arg6 : !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%arg3, %arg4, %arg5, %arg5} | |
%11 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%12 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%13 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%14 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%15 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%16 = flow.dispatch.tensor.load %9, offsets = [0, 0], sizes = [%11, %12], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%11, %12} -> tensor<?x?xi32> | |
%17 = tensor.empty(%13, %14, %15, %15) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %16 inner_dims_pos = [0, 1] inner_tiles = [%15, %15] into %17 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %10, offsets = [0, 0, 0, 0], sizes = [%13, %14, %15, %15], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%13, %14, %15, %15} | |
flow.return | |
} count(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%8 = flow.tensor.reshape %7 : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After InitializeEmptyTensorsPass (iree-flow-initialize-empty-tensors) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = flow.dispatch.workgroups[%1, %2, %5, %6, %4](%3, %1, %2, %5, %6, %4) : (tensor<?x?xi32>{%1, %2}, index, index, index, index, index) -> tensor<?x?x?x?xi32>{%5, %6, %4, %4} = | |
(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>) { | |
%9 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%10 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%11 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%12 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%13 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%14 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%9, %10} | |
%15 = flow.dispatch.tie_shape %arg6 : !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%11, %12, %13, %13} | |
%16 = flow.dispatch.tensor.load %14, offsets = [0, 0], sizes = [%9, %10], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%9, %10} -> tensor<?x?xi32> | |
%17 = tensor.empty(%11, %12, %13, %13) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %16 inner_dims_pos = [0, 1] inner_tiles = [%13, %13] into %17 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %15, offsets = [0, 0, 0, 0], sizes = [%11, %12, %13, %13], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%11, %12, %13, %13} | |
flow.return | |
} count(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%8 = flow.tensor.reshape %7 : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = flow.dispatch.workgroups[%1, %2, %5, %6, %4](%3, %1, %2, %5, %6, %4) : (tensor<?x?xi32>{%1, %2}, index, index, index, index, index) -> tensor<?x?x?x?xi32>{%5, %6, %4, %4} = | |
(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>) { | |
%9 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%10 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%11 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%12 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%13 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%14 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%9, %10} | |
%15 = flow.dispatch.tie_shape %arg6 : !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%11, %12, %13, %13} | |
%16 = flow.dispatch.tensor.load %14, offsets = [0, 0], sizes = [%9, %10], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%9, %10} -> tensor<?x?xi32> | |
%17 = tensor.empty(%11, %12, %13, %13) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %16 inner_dims_pos = [0, 1] inner_tiles = [%13, %13] into %17 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %15, offsets = [0, 0, 0, 0], sizes = [%11, %12, %13, %13], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%11, %12, %13, %13} | |
flow.return | |
} count(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%8 = flow.tensor.reshape %7 : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After InitializeEmptyTensorsPass (iree-flow-initialize-empty-tensors) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = flow.dispatch.workgroups[%1, %2, %5, %6, %4](%3, %1, %2, %5, %6, %4) : (tensor<?x?xi32>{%1, %2}, index, index, index, index, index) -> tensor<?x?x?x?xi32>{%5, %6, %4, %4} = | |
(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>) { | |
%9 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%10 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%11 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%12 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%13 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%14 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%9, %10} | |
%15 = flow.dispatch.tie_shape %arg6 : !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%11, %12, %13, %13} | |
%16 = flow.dispatch.tensor.load %14, offsets = [0, 0], sizes = [%9, %10], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%9, %10} -> tensor<?x?xi32> | |
%17 = tensor.empty(%11, %12, %13, %13) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %16 inner_dims_pos = [0, 1] inner_tiles = [%13, %13] into %17 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %15, offsets = [0, 0, 0, 0], sizes = [%11, %12, %13, %13], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%11, %12, %13, %13} | |
flow.return | |
} count(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%8 = flow.tensor.reshape %7 : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After OutlineDispatchExternsPass (iree-flow-outline-dispatch-externs) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = flow.dispatch.workgroups[%1, %2, %5, %6, %4](%3, %1, %2, %5, %6, %4) : (tensor<?x?xi32>{%1, %2}, index, index, index, index, index) -> tensor<?x?x?x?xi32>{%5, %6, %4, %4} = | |
(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>) { | |
%9 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%10 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%11 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%12 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%13 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%14 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%9, %10} | |
%15 = flow.dispatch.tie_shape %arg6 : !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%11, %12, %13, %13} | |
%16 = flow.dispatch.tensor.load %14, offsets = [0, 0], sizes = [%9, %10], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%9, %10} -> tensor<?x?xi32> | |
%17 = tensor.empty(%11, %12, %13, %13) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %16 inner_dims_pos = [0, 1] inner_tiles = [%13, %13] into %17 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %15, offsets = [0, 0, 0, 0], sizes = [%11, %12, %13, %13], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%11, %12, %13, %13} | |
flow.return | |
} count(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%8 = flow.tensor.reshape %7 : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After OutlineDispatchRegionsPass (iree-flow-outline-dispatch-regions) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
flow.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
flow.executable.export public @_fully_dynamic_pack_simple_dispatch_0 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>) { | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = flow.dispatch.tie_shape %arg6 : !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = flow.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0[%1, %2, %5, %6, %4](%3, %1, %2, %5, %6, %4) : (tensor<?x?xi32>{%1, %2}, index, index, index, index, index) -> tensor<?x?x?x?xi32>{%5, %6, %4, %4} | |
%8 = flow.tensor.reshape %7 : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After AnnotateDispatchesPass (iree-flow-annotate-dispatches) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
flow.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
flow.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>) { | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = flow.dispatch.tie_shape %arg6 : !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = flow.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%1, %2, %5, %6, %4](%3, %1, %2, %5, %6, %4) : (tensor<?x?xi32>{%1, %2}, index, index, index, index, index) -> tensor<?x?x?x?xi32>{%5, %6, %4, %4} | |
%8 = flow.tensor.reshape %7 : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After StripDebugOps (iree-util-strip-debug-ops) //----- // | |
flow.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
flow.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>) { | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = flow.dispatch.tie_shape %arg6 : !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = flow.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%1, %2, %5, %6, %4](%3, %1, %2, %5, %6, %4) : (tensor<?x?xi32>{%1, %2}, index, index, index, index, index) -> tensor<?x?x?x?xi32>{%5, %6, %4, %4} | |
%8 = flow.tensor.reshape %7 : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After DeduplicateExecutablesPass (iree-flow-deduplicate-executables) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
flow.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
flow.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>) { | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = flow.dispatch.tie_shape %arg6 : !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = flow.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%1, %2, %5, %6, %4](%3, %1, %2, %5, %6, %4) : (tensor<?x?xi32>{%1, %2}, index, index, index, index, index) -> tensor<?x?x?x?xi32>{%5, %6, %4, %4} | |
%8 = flow.tensor.reshape %7 : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CleanupTensorShapesPass (iree-flow-cleanup-tensor-shapes) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = flow.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%1, %2, %5, %6, %4](%3, %1, %2, %5, %6, %4) : (tensor<?x?xi32>{%1, %2}, index, index, index, index, index) -> tensor<?x?x?x?xi32>{%5, %6, %4, %4} | |
%8 = flow.tensor.reshape %7 : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After CleanupTensorShapesPass (iree-flow-cleanup-tensor-shapes) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%cst = arith.constant dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%0 = flow.tensor.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = flow.tensor.reshape %0 : tensor<4x4xi32> -> tensor<?x?xi32>{%1, %2} | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = flow.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%1, %2, %5, %6, %4](%3, %1, %2, %5, %6, %4) : (tensor<?x?xi32>{%1, %2}, index, index, index, index, index) -> tensor<?x?x?x?xi32>{%5, %6, %4, %4} | |
%8 = flow.tensor.reshape %7 : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%8, %cst) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After OutlineConstantsPass (iree-flow-outline-constants) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
flow.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
flow.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>) { | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = flow.dispatch.tie_shape %arg6 : !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
util.global private @__constant_tensor_4x4xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
util.func private @_fully_dynamic_pack_simple() { | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : tensor<2x2x2x2xi32> | |
%c2 = arith.constant 2 : index | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = flow.tensor.reshape %__constant_tensor_4x4xi32 : tensor<4x4xi32> -> tensor<?x?xi32>{%0, %1} | |
%3 = util.optimization_barrier %c2 : index | |
%4 = arith.ceildivui %0, %3 : index | |
%5 = arith.ceildivui %1, %3 : index | |
%6 = flow.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %4, %5, %3](%2, %0, %1, %4, %5, %3) : (tensor<?x?xi32>{%0, %1}, index, index, index, index, index) -> tensor<?x?x?x?xi32>{%4, %5, %3, %3} | |
%7 = flow.tensor.reshape %6 : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%7, %__constant_tensor_2x2x2x2xi32) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : tensor<2x2x2x2xi32> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : tensor<4x4xi32> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = flow.tensor.reshape %__constant_tensor_4x4xi32 : tensor<4x4xi32> -> tensor<?x?xi32>{%0, %1} | |
%3 = util.optimization_barrier %c2 : index | |
%4 = arith.ceildivui %0, %3 : index | |
%5 = arith.ceildivui %1, %3 : index | |
%6 = flow.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %4, %5, %3](%2, %0, %1, %4, %5, %3) : (tensor<?x?xi32>{%0, %1}, index, index, index, index, index) -> tensor<?x?x?x?xi32>{%4, %5, %3, %3} | |
%7 = flow.tensor.reshape %6 : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%7, %__constant_tensor_2x2x2x2xi32) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : tensor<2x2x2x2xi32> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : tensor<4x4xi32> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = flow.tensor.reshape %__constant_tensor_4x4xi32 : tensor<4x4xi32> -> tensor<?x?xi32>{%0, %1} | |
%3 = util.optimization_barrier %c2 : index | |
%4 = arith.ceildivui %0, %3 : index | |
%5 = arith.ceildivui %1, %3 : index | |
%6 = flow.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %4, %5, %3](%2, %0, %1, %4, %5, %3) : (tensor<?x?xi32>{%0, %1}, index, index, index, index, index) -> tensor<?x?x?x?xi32>{%4, %5, %3, %3} | |
%7 = flow.tensor.reshape %6 : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%7, %__constant_tensor_2x2x2x2xi32) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : tensor<2x2x2x2xi32> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : tensor<4x4xi32> | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = flow.tensor.reshape %__constant_tensor_4x4xi32 : tensor<4x4xi32> -> tensor<?x?xi32>{%0, %1} | |
%3 = util.optimization_barrier %c2 : index | |
%4 = arith.ceildivui %0, %3 : index | |
%5 = arith.ceildivui %1, %3 : index | |
%6 = flow.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %4, %5, %3](%2, %0, %1, %4, %5, %3) : (tensor<?x?xi32>{%0, %1}, index, index, index, index, index) -> tensor<?x?x?x?xi32>{%4, %5, %3, %3} | |
%7 = flow.tensor.reshape %6 : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%7, %__constant_tensor_2x2x2x2xi32) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
flow.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
flow.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>) { | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = flow.dispatch.tie_shape %arg6 : !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
util.global private @__constant_tensor_4x4xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
util.func private @_fully_dynamic_pack_simple() { | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : tensor<2x2x2x2xi32> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : tensor<4x4xi32> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = flow.tensor.reshape %__constant_tensor_4x4xi32 : tensor<4x4xi32> -> tensor<?x?xi32>{%0, %1} | |
%3 = util.optimization_barrier %c2 : index | |
%4 = arith.ceildivui %0, %3 : index | |
%5 = arith.ceildivui %1, %3 : index | |
%6 = flow.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %4, %5, %3](%2, %0, %1, %4, %5, %3) : (tensor<?x?xi32>{%0, %1}, index, index, index, index, index) -> tensor<?x?x?x?xi32>{%4, %5, %3, %3} | |
%7 = flow.tensor.reshape %6 : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%7, %__constant_tensor_2x2x2x2xi32) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
flow.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
flow.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>) { | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = flow.dispatch.tie_shape %arg6 : !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
util.global private @__constant_tensor_4x4xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
util.func private @_fully_dynamic_pack_simple() { | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : tensor<2x2x2x2xi32> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : tensor<4x4xi32> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = flow.tensor.reshape %__constant_tensor_4x4xi32 : tensor<4x4xi32> -> tensor<?x?xi32>{%0, %1} | |
%3 = util.optimization_barrier %c2 : index | |
%4 = arith.ceildivui %0, %3 : index | |
%5 = arith.ceildivui %1, %3 : index | |
%6 = flow.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %4, %5, %3](%2, %0, %1, %4, %5, %3) : (tensor<?x?xi32>{%0, %1}, index, index, index, index, index) -> tensor<?x?x?x?xi32>{%4, %5, %3, %3} | |
%7 = flow.tensor.reshape %6 : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%7, %__constant_tensor_2x2x2x2xi32) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
flow.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
flow.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>) { | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = flow.dispatch.tie_shape %arg6 : !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
util.global private @__constant_tensor_4x4xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
util.func private @_fully_dynamic_pack_simple() { | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : tensor<2x2x2x2xi32> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : tensor<4x4xi32> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = flow.tensor.reshape %__constant_tensor_4x4xi32 : tensor<4x4xi32> -> tensor<?x?xi32>{%0, %1} | |
%3 = util.optimization_barrier %c2 : index | |
%4 = arith.ceildivui %0, %3 : index | |
%5 = arith.ceildivui %1, %3 : index | |
%6 = flow.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %4, %5, %3](%2, %0, %1, %4, %5, %3) : (tensor<?x?xi32>{%0, %1}, index, index, index, index, index) -> tensor<?x?x?x?xi32>{%4, %5, %3, %3} | |
%7 = flow.tensor.reshape %6 : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%7, %__constant_tensor_2x2x2x2xi32) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After IPO (iree-util-ipo) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
flow.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
flow.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>) { | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = flow.dispatch.tie_shape %arg6 : !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
util.global private @__constant_tensor_4x4xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
util.func private @_fully_dynamic_pack_simple() { | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : tensor<2x2x2x2xi32> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : tensor<4x4xi32> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = flow.tensor.reshape %__constant_tensor_4x4xi32 : tensor<4x4xi32> -> tensor<?x?xi32>{%0, %1} | |
%3 = util.optimization_barrier %c2 : index | |
%4 = arith.ceildivui %0, %3 : index | |
%5 = arith.ceildivui %1, %3 : index | |
%6 = flow.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %4, %5, %3](%2, %0, %1, %4, %5, %3) : (tensor<?x?xi32>{%0, %1}, index, index, index, index, index) -> tensor<?x?x?x?xi32>{%4, %5, %3, %3} | |
%7 = flow.tensor.reshape %6 : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%7, %__constant_tensor_2x2x2x2xi32) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FixedPointIterator (iree-util-fixed-point-iterator) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
flow.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
flow.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>) { | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = flow.dispatch.tie_shape %arg6 : !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
util.global private @__constant_tensor_4x4xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
util.func private @_fully_dynamic_pack_simple() { | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : tensor<2x2x2x2xi32> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : tensor<4x4xi32> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = flow.tensor.reshape %__constant_tensor_4x4xi32 : tensor<4x4xi32> -> tensor<?x?xi32>{%0, %1} | |
%3 = util.optimization_barrier %c2 : index | |
%4 = arith.ceildivui %0, %3 : index | |
%5 = arith.ceildivui %1, %3 : index | |
%6 = flow.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %4, %5, %3](%2, %0, %1, %4, %5, %3) : (tensor<?x?xi32>{%0, %1}, index, index, index, index, index) -> tensor<?x?x?x?xi32>{%4, %5, %3, %3} | |
%7 = flow.tensor.reshape %6 : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%7, %__constant_tensor_2x2x2x2xi32) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After SymbolDCE (symbol-dce) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
flow.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
flow.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>) { | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = flow.dispatch.tie_shape %arg6 : !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
util.global private @__constant_tensor_4x4xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
util.func private @_fully_dynamic_pack_simple() { | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : tensor<2x2x2x2xi32> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : tensor<4x4xi32> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = flow.tensor.reshape %__constant_tensor_4x4xi32 : tensor<4x4xi32> -> tensor<?x?xi32>{%0, %1} | |
%3 = util.optimization_barrier %c2 : index | |
%4 = arith.ceildivui %0, %3 : index | |
%5 = arith.ceildivui %1, %3 : index | |
%6 = flow.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %4, %5, %3](%2, %0, %1, %4, %5, %3) : (tensor<?x?xi32>{%0, %1}, index, index, index, index, index) -> tensor<?x?x?x?xi32>{%4, %5, %3, %3} | |
%7 = flow.tensor.reshape %6 : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%7, %__constant_tensor_2x2x2x2xi32) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After VerifyInputPass (iree-stream-verify-input) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
flow.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
flow.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>) { | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = flow.dispatch.tie_shape %arg6 : !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
util.global private @__constant_tensor_4x4xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
util.func private @_fully_dynamic_pack_simple() { | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : tensor<2x2x2x2xi32> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : tensor<4x4xi32> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = flow.tensor.reshape %__constant_tensor_4x4xi32 : tensor<4x4xi32> -> tensor<?x?xi32>{%0, %1} | |
%3 = util.optimization_barrier %c2 : index | |
%4 = arith.ceildivui %0, %3 : index | |
%5 = arith.ceildivui %1, %3 : index | |
%6 = flow.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %4, %5, %3](%2, %0, %1, %4, %5, %3) : (tensor<?x?xi32>{%0, %1}, index, index, index, index, index) -> tensor<?x?x?x?xi32>{%4, %5, %3, %3} | |
%7 = flow.tensor.reshape %6 : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%7, %__constant_tensor_2x2x2x2xi32) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : tensor<2x2x2x2xi32> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : tensor<4x4xi32> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = flow.tensor.reshape %__constant_tensor_4x4xi32 : tensor<4x4xi32> -> tensor<?x?xi32>{%0, %1} | |
%3 = util.optimization_barrier %c2 : index | |
%4 = arith.ceildivui %0, %3 : index | |
%5 = arith.ceildivui %1, %3 : index | |
%6 = flow.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %4, %5, %3](%2, %0, %1, %4, %5, %3) : (tensor<?x?xi32>{%0, %1}, index, index, index, index, index) -> tensor<?x?x?x?xi32>{%4, %5, %3, %3} | |
%7 = flow.tensor.reshape %6 : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%7, %__constant_tensor_2x2x2x2xi32) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : tensor<2x2x2x2xi32> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : tensor<4x4xi32> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = flow.tensor.reshape %__constant_tensor_4x4xi32 : tensor<4x4xi32> -> tensor<?x?xi32>{%0, %1} | |
%3 = util.optimization_barrier %c2 : index | |
%4 = arith.ceildivui %0, %3 : index | |
%5 = arith.ceildivui %1, %3 : index | |
%6 = flow.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %4, %5, %3](%2, %0, %1, %4, %5, %3) : (tensor<?x?xi32>{%0, %1}, index, index, index, index, index) -> tensor<?x?x?x?xi32>{%4, %5, %3, %3} | |
%7 = flow.tensor.reshape %6 : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%7, %__constant_tensor_2x2x2x2xi32) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : tensor<2x2x2x2xi32> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : tensor<4x4xi32> | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = flow.tensor.reshape %__constant_tensor_4x4xi32 : tensor<4x4xi32> -> tensor<?x?xi32>{%0, %1} | |
%3 = util.optimization_barrier %c2 : index | |
%4 = arith.ceildivui %0, %3 : index | |
%5 = arith.ceildivui %1, %3 : index | |
%6 = flow.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %4, %5, %3](%2, %0, %1, %4, %5, %3) : (tensor<?x?xi32>{%0, %1}, index, index, index, index, index) -> tensor<?x?x?x?xi32>{%4, %5, %3, %3} | |
%7 = flow.tensor.reshape %6 : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%7, %__constant_tensor_2x2x2x2xi32) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
flow.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
flow.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>) { | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = flow.dispatch.tie_shape %arg6 : !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
util.global private @__constant_tensor_4x4xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
util.func private @_fully_dynamic_pack_simple() { | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : tensor<2x2x2x2xi32> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : tensor<4x4xi32> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = flow.tensor.reshape %__constant_tensor_4x4xi32 : tensor<4x4xi32> -> tensor<?x?xi32>{%0, %1} | |
%3 = util.optimization_barrier %c2 : index | |
%4 = arith.ceildivui %0, %3 : index | |
%5 = arith.ceildivui %1, %3 : index | |
%6 = flow.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %4, %5, %3](%2, %0, %1, %4, %5, %3) : (tensor<?x?xi32>{%0, %1}, index, index, index, index, index) -> tensor<?x?x?x?xi32>{%4, %5, %3, %3} | |
%7 = flow.tensor.reshape %6 : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%7, %__constant_tensor_2x2x2x2xi32) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
flow.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
flow.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>) { | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = flow.dispatch.tie_shape %arg6 : !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
util.global private @__constant_tensor_4x4xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
util.func private @_fully_dynamic_pack_simple() { | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : tensor<2x2x2x2xi32> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : tensor<4x4xi32> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = flow.tensor.reshape %__constant_tensor_4x4xi32 : tensor<4x4xi32> -> tensor<?x?xi32>{%0, %1} | |
%3 = util.optimization_barrier %c2 : index | |
%4 = arith.ceildivui %0, %3 : index | |
%5 = arith.ceildivui %1, %3 : index | |
%6 = flow.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %4, %5, %3](%2, %0, %1, %4, %5, %3) : (tensor<?x?xi32>{%0, %1}, index, index, index, index, index) -> tensor<?x?x?x?xi32>{%4, %5, %3, %3} | |
%7 = flow.tensor.reshape %6 : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%7, %__constant_tensor_2x2x2x2xi32) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
flow.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
flow.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>) { | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = flow.dispatch.tie_shape %arg6 : !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
util.global private @__constant_tensor_4x4xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
util.func private @_fully_dynamic_pack_simple() { | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : tensor<2x2x2x2xi32> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : tensor<4x4xi32> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = flow.tensor.reshape %__constant_tensor_4x4xi32 : tensor<4x4xi32> -> tensor<?x?xi32>{%0, %1} | |
%3 = util.optimization_barrier %c2 : index | |
%4 = arith.ceildivui %0, %3 : index | |
%5 = arith.ceildivui %1, %3 : index | |
%6 = flow.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %4, %5, %3](%2, %0, %1, %4, %5, %3) : (tensor<?x?xi32>{%0, %1}, index, index, index, index, index) -> tensor<?x?x?x?xi32>{%4, %5, %3, %3} | |
%7 = flow.tensor.reshape %6 : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%7, %__constant_tensor_2x2x2x2xi32) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After IPO (iree-util-ipo) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
flow.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
flow.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>) { | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = flow.dispatch.tie_shape %arg6 : !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
util.global private @__constant_tensor_4x4xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
util.func private @_fully_dynamic_pack_simple() { | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : tensor<2x2x2x2xi32> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : tensor<4x4xi32> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = flow.tensor.reshape %__constant_tensor_4x4xi32 : tensor<4x4xi32> -> tensor<?x?xi32>{%0, %1} | |
%3 = util.optimization_barrier %c2 : index | |
%4 = arith.ceildivui %0, %3 : index | |
%5 = arith.ceildivui %1, %3 : index | |
%6 = flow.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %4, %5, %3](%2, %0, %1, %4, %5, %3) : (tensor<?x?xi32>{%0, %1}, index, index, index, index, index) -> tensor<?x?x?x?xi32>{%4, %5, %3, %3} | |
%7 = flow.tensor.reshape %6 : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%7, %__constant_tensor_2x2x2x2xi32) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After ConvertToStreamPass (iree-stream-conversion) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
// Public entry point. Takes no arguments and returns nothing (see the
// iree.abi.declaration string); it only forwards to the private
// @_fully_dynamic_pack_simple function.
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// Executable containing the single pack dispatch.
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
// Export: the (x, y, z) workgroup count is derived from the five index
// workload operands by flow.dispatch.workgroup_count_from_slice.
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body. %arg0 is bound read-only (source), %arg6 write-only
// (result); %arg1..%arg5 are index operands tagged as workload ordinals 0..4.
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
// %0, %1: dims of the 2-D source; %2, %3: outer dims of the packed result;
// %4: tile size, used for both inner dims of the result.
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
// Both bindings are viewed starting at offset %c0.
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
// Load the whole %0 x %1 source tensor.
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
// Pack source dims 0 and 1 with dynamic tile sizes %4 x %4 into a fresh
// %2 x %3 x %4 x %4 destination; no padding value is given.
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
// Store the full packed tensor to the output binding.
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global private @__constant_tensor_2x2x2x2xi32__size : index | |
// Initializer for the 2x2x2x2 i32 constant: creates it as a stream constant
// resource on @__device_0, queries the resource size, and stores both the
// resource and its size into the corresponding globals.
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<2x2x2x2xi32> in !stream.resource<constant> = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_2x2x2x2xi32__size : index | |
util.return | |
} | |
util.global private @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.global private @__constant_tensor_4x4xi32__size : index | |
// Initializer for the 4x4 i32 constant (values 0..15): creates it as a stream
// constant resource on @__device_0, queries the resource size, and stores both
// the resource and its size into the corresponding globals.
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<4x4xi32> in !stream.resource<constant> = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_4x4xi32__size : index | |
util.return | |
} | |
// Test body: dispatches the pack kernel on the 4x4 constant using shape and
// tile-size values routed through util.optimization_barrier, then compares the
// result against the 2x2x2x2 reference constant with check.expect_eq.
util.func private @_fully_dynamic_pack_simple() { | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
// Load the reference constant and its size, and transfer it to a
// !stream.resource<*> on the same device.
%__constant_tensor_2x2x2x2xi32 = util.global.load @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_2x2x2x2xi32__size = util.global.load @__constant_tensor_2x2x2x2xi32__size : index | |
%0 = stream.async.transfer %__constant_tensor_2x2x2x2xi32 : !stream.resource<constant>{%__constant_tensor_2x2x2x2xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_2x2x2x2xi32__size} | |
// Same for the 4x4 source constant.
%__constant_tensor_4x4xi32 = util.global.load @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32__size = util.global.load @__constant_tensor_4x4xi32__size : index | |
%1 = stream.async.transfer %__constant_tensor_4x4xi32 : !stream.resource<constant>{%__constant_tensor_4x4xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_4x4xi32__size} | |
// %2, %3: source dims (both 4) passed through optimization barriers —
// presumably so they are not constant-folded; confirm against util dialect docs.
%2 = util.optimization_barrier %c4 : index | |
%3 = util.optimization_barrier %c4 : index | |
// Re-type the static 4x4 resource as a dynamic ?x? tensor of dims (%2, %3).
%4 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<?x?xi32>{%2, %3} : index | |
%5 = stream.tensor.clone on(#hal.device.affinity<@__device_0>) %1 : tensor<4x4xi32> in !stream.resource<*>{%__constant_tensor_4x4xi32__size} -> tensor<?x?xi32>{%2, %3} in !stream.resource<*>{%4} | |
// %6: tile size (2), also behind a barrier. %7, %8: outer dims of the
// packed result, ceil(dim / tile).
%6 = util.optimization_barrier %c2 : index | |
%7 = arith.ceildivui %2, %6 : index | |
%8 = arith.ceildivui %3, %6 : index | |
%c0 = arith.constant 0 : index | |
// %9: size of the %7 x %8 x %6 x %6 result resource.
%9 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<?x?x?x?xi32>{%7, %8, %6, %6} : index | |
// Dispatch with workload [%2, %3, %7, %8, %6]; the operand is the range
// [%c0, %4) of %5 and the same five index values are passed as operands.
%10 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%2, %3, %7, %8, %6](%5[%c0 to %4 for %4], %2, %3, %7, %8, %6) : (!stream.resource<*>{%4}, index, index, index, index, index) -> !stream.resource<*>{%9} | |
// Re-type the dynamic result (annotated with dims %c2 x4) as static 2x2x2x2.
%11 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2x2x2x2xi32> : index | |
%12 = stream.tensor.clone on(#hal.device.affinity<@__device_0>) %10 : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} in !stream.resource<*>{%9} -> tensor<2x2x2x2xi32> in !stream.resource<*>{%11} | |
// Move both result and reference into external resources, export them as
// tensors, and compare.
%13 = stream.async.transfer %12 : !stream.resource<*>{%11} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%11} | |
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2x2x2xi32> in !stream.resource<external>{%11} -> tensor<2x2x2x2xi32> | |
%15 = stream.async.transfer %0 : !stream.resource<*>{%__constant_tensor_2x2x2x2xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_2x2x2x2xi32__size} | |
%16 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %15 : tensor<2x2x2x2xi32> in !stream.resource<external>{%__constant_tensor_2x2x2x2xi32__size} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%14, %16) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After VerifyLoweringToTensorsPass (iree-stream-verify-lowering-to-tensors) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
// Public entry point. Takes no arguments and returns nothing (see the
// iree.abi.declaration string); it only forwards to the private
// @_fully_dynamic_pack_simple function.
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// Executable containing the single pack dispatch.
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
// Export: the (x, y, z) workgroup count is derived from the five index
// workload operands by flow.dispatch.workgroup_count_from_slice.
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body. %arg0 is bound read-only (source), %arg6 write-only
// (result); %arg1..%arg5 are index operands tagged as workload ordinals 0..4.
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
// %0, %1: dims of the 2-D source; %2, %3: outer dims of the packed result;
// %4: tile size, used for both inner dims of the result.
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
// Both bindings are viewed starting at offset %c0.
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
// Load the whole %0 x %1 source tensor.
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
// Pack source dims 0 and 1 with dynamic tile sizes %4 x %4 into a fresh
// %2 x %3 x %4 x %4 destination; no padding value is given.
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
// Store the full packed tensor to the output binding.
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global private @__constant_tensor_2x2x2x2xi32__size : index | |
// Initializer for the 2x2x2x2 i32 constant: creates it as a stream constant
// resource on @__device_0, queries the resource size, and stores both the
// resource and its size into the corresponding globals.
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<2x2x2x2xi32> in !stream.resource<constant> = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_2x2x2x2xi32__size : index | |
util.return | |
} | |
util.global private @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.global private @__constant_tensor_4x4xi32__size : index | |
// Initializer for the 4x4 i32 constant (values 0..15): creates it as a stream
// constant resource on @__device_0, queries the resource size, and stores both
// the resource and its size into the corresponding globals.
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<4x4xi32> in !stream.resource<constant> = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_4x4xi32__size : index | |
util.return | |
} | |
// Test body: dispatches the pack kernel on the 4x4 constant using shape and
// tile-size values routed through util.optimization_barrier, then compares the
// result against the 2x2x2x2 reference constant with check.expect_eq.
util.func private @_fully_dynamic_pack_simple() { | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
// Load the reference constant and its size, and transfer it to a
// !stream.resource<*> on the same device.
%__constant_tensor_2x2x2x2xi32 = util.global.load @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_2x2x2x2xi32__size = util.global.load @__constant_tensor_2x2x2x2xi32__size : index | |
%0 = stream.async.transfer %__constant_tensor_2x2x2x2xi32 : !stream.resource<constant>{%__constant_tensor_2x2x2x2xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_2x2x2x2xi32__size} | |
// Same for the 4x4 source constant.
%__constant_tensor_4x4xi32 = util.global.load @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32__size = util.global.load @__constant_tensor_4x4xi32__size : index | |
%1 = stream.async.transfer %__constant_tensor_4x4xi32 : !stream.resource<constant>{%__constant_tensor_4x4xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_4x4xi32__size} | |
// %2, %3: source dims (both 4) passed through optimization barriers —
// presumably so they are not constant-folded; confirm against util dialect docs.
%2 = util.optimization_barrier %c4 : index | |
%3 = util.optimization_barrier %c4 : index | |
// Re-type the static 4x4 resource as a dynamic ?x? tensor of dims (%2, %3).
%4 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<?x?xi32>{%2, %3} : index | |
%5 = stream.tensor.clone on(#hal.device.affinity<@__device_0>) %1 : tensor<4x4xi32> in !stream.resource<*>{%__constant_tensor_4x4xi32__size} -> tensor<?x?xi32>{%2, %3} in !stream.resource<*>{%4} | |
// %6: tile size (2), also behind a barrier. %7, %8: outer dims of the
// packed result, ceil(dim / tile).
%6 = util.optimization_barrier %c2 : index | |
%7 = arith.ceildivui %2, %6 : index | |
%8 = arith.ceildivui %3, %6 : index | |
%c0 = arith.constant 0 : index | |
// %9: size of the %7 x %8 x %6 x %6 result resource.
%9 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<?x?x?x?xi32>{%7, %8, %6, %6} : index | |
// Dispatch with workload [%2, %3, %7, %8, %6]; the operand is the range
// [%c0, %4) of %5 and the same five index values are passed as operands.
%10 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%2, %3, %7, %8, %6](%5[%c0 to %4 for %4], %2, %3, %7, %8, %6) : (!stream.resource<*>{%4}, index, index, index, index, index) -> !stream.resource<*>{%9} | |
// Re-type the dynamic result (annotated with dims %c2 x4) as static 2x2x2x2.
%11 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2x2x2x2xi32> : index | |
%12 = stream.tensor.clone on(#hal.device.affinity<@__device_0>) %10 : tensor<?x?x?x?xi32>{%c2, %c2, %c2, %c2} in !stream.resource<*>{%9} -> tensor<2x2x2x2xi32> in !stream.resource<*>{%11} | |
// Move both result and reference into external resources, export them as
// tensors, and compare.
%13 = stream.async.transfer %12 : !stream.resource<*>{%11} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%11} | |
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2x2x2xi32> in !stream.resource<external>{%11} -> tensor<2x2x2x2xi32> | |
%15 = stream.async.transfer %0 : !stream.resource<*>{%__constant_tensor_2x2x2x2xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_2x2x2x2xi32__size} | |
%16 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %15 : tensor<2x2x2x2xi32> in !stream.resource<external>{%__constant_tensor_2x2x2x2xi32__size} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%14, %16) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
// Test body: dispatches the pack kernel directly on the transferred 4x4
// constant and compares the result against the 2x2x2x2 reference constant.
// No stream.tensor.clone ops appear in this form of the function.
util.func private @_fully_dynamic_pack_simple() { | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
// Load both constants and their sizes from globals.
%__constant_tensor_2x2x2x2xi32 = util.global.load @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_2x2x2x2xi32__size = util.global.load @__constant_tensor_2x2x2x2xi32__size : index | |
%__constant_tensor_4x4xi32 = util.global.load @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32__size = util.global.load @__constant_tensor_4x4xi32__size : index | |
// Source constant moved to a !stream.resource<*>, sized by the global size.
%0 = stream.async.transfer %__constant_tensor_4x4xi32 : !stream.resource<constant>{%__constant_tensor_4x4xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_4x4xi32__size} | |
// %1, %2: source dims (both 4) passed through optimization barriers —
// presumably so they are not constant-folded; confirm against util dialect docs.
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<?x?xi32>{%1, %2} : index | |
// %4: tile size (2) behind a barrier. %5, %6: outer dims, ceil(dim / tile).
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<?x?x?x?xi32>{%5, %6, %4, %4} : index | |
// NOTE(review): %0 was produced with size %__constant_tensor_4x4xi32__size
// but is typed here as {%3}; likewise %8 is produced as {%7} yet consumed
// below as {%9}. Presumably these sizes are equal at runtime — confirm.
%8 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%1, %2, %5, %6, %4](%0[%c0 to %3 for %3], %1, %2, %5, %6, %4) : (!stream.resource<*>{%3}, index, index, index, index, index) -> !stream.resource<*>{%7} | |
// %9: size of a static 2x2x2x2 i32 tensor.
%9 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2x2x2x2xi32> : index | |
// Move result and reference into external resources, export, and compare.
%10 = stream.async.transfer %8 : !stream.resource<*>{%9} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%9} | |
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x2x2x2xi32> in !stream.resource<external>{%9} -> tensor<2x2x2x2xi32> | |
%12 = stream.async.transfer %__constant_tensor_2x2x2x2xi32 : !stream.resource<constant>{%__constant_tensor_2x2x2x2xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_2x2x2x2xi32__size} | |
%13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2x2x2x2xi32> in !stream.resource<external>{%__constant_tensor_2x2x2x2xi32__size} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%11, %13) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
// Public entry point. Takes no arguments and returns nothing (see the
// iree.abi.declaration string); it only forwards to the private
// @_fully_dynamic_pack_simple function.
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
// Dispatch body. %arg0 is bound read-only (source), %arg6 write-only
// (result); %arg1..%arg5 are index operands tagged as workload ordinals 0..4.
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
// %0, %1: dims of the 2-D source; %2, %3: outer dims of the packed result;
// %4: tile size, used for both inner dims of the result.
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
// Both bindings are viewed starting at offset %c0.
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
// Load the whole %0 x %1 source tensor.
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
// Pack source dims 0 and 1 with dynamic tile sizes %4 x %4 into a fresh
// %2 x %3 x %4 x %4 destination; no padding value is given.
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
// Store the full packed tensor to the output binding.
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
// Initializer for the 2x2x2x2 i32 constant: creates it as a stream constant
// resource on @__device_0, queries the resource size, and stores both the
// resource and its size into the corresponding globals.
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<2x2x2x2xi32> in !stream.resource<constant> = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_2x2x2x2xi32__size : index | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
// Initializer for the 4x4 i32 constant (values 0..15): creates it as a stream
// constant resource on @__device_0, queries the resource size, and stores both
// the resource and its size into the corresponding globals.
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<4x4xi32> in !stream.resource<constant> = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_4x4xi32__size : index | |
util.return | |
} | |
// -----// IR Dump After Inliner (inline) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
// Public entry point. Takes no arguments and returns nothing (see the
// iree.abi.declaration string); it only forwards to the private
// @_fully_dynamic_pack_simple function, which is still a separate call here.
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// Executable containing the single pack dispatch.
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
// Export: the (x, y, z) workgroup count is derived from the five index
// workload operands by flow.dispatch.workgroup_count_from_slice.
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body. %arg0 is bound read-only (source), %arg6 write-only
// (result); %arg1..%arg5 are index operands tagged as workload ordinals 0..4.
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
// %0, %1: dims of the 2-D source; %2, %3: outer dims of the packed result;
// %4: tile size, used for both inner dims of the result.
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
// Both bindings are viewed starting at offset %c0.
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
// Load the whole %0 x %1 source tensor.
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
// Pack source dims 0 and 1 with dynamic tile sizes %4 x %4 into a fresh
// %2 x %3 x %4 x %4 destination; no padding value is given.
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
// Store the full packed tensor to the output binding.
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global private @__constant_tensor_2x2x2x2xi32__size : index | |
// Initializer for the 2x2x2x2 i32 constant: creates it as a stream constant
// resource on @__device_0, queries the resource size, and stores both the
// resource and its size into the corresponding globals.
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<2x2x2x2xi32> in !stream.resource<constant> = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_2x2x2x2xi32__size : index | |
util.return | |
} | |
util.global private @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.global private @__constant_tensor_4x4xi32__size : index | |
// Initializer for the 4x4 i32 constant (values 0..15): creates it as a stream
// constant resource on @__device_0, queries the resource size, and stores both
// the resource and its size into the corresponding globals.
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<4x4xi32> in !stream.resource<constant> = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_4x4xi32__size : index | |
util.return | |
} | |
// Test body: dispatches the pack kernel directly on the transferred 4x4
// constant and compares the result against the 2x2x2x2 reference constant.
// No stream.tensor.clone ops appear in this form of the function.
util.func private @_fully_dynamic_pack_simple() { | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
// Load both constants and their sizes from globals.
%__constant_tensor_2x2x2x2xi32 = util.global.load @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_2x2x2x2xi32__size = util.global.load @__constant_tensor_2x2x2x2xi32__size : index | |
%__constant_tensor_4x4xi32 = util.global.load @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32__size = util.global.load @__constant_tensor_4x4xi32__size : index | |
// Source constant moved to a !stream.resource<*>, sized by the global size.
%0 = stream.async.transfer %__constant_tensor_4x4xi32 : !stream.resource<constant>{%__constant_tensor_4x4xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_4x4xi32__size} | |
// %1, %2: source dims (both 4) passed through optimization barriers —
// presumably so they are not constant-folded; confirm against util dialect docs.
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<?x?xi32>{%1, %2} : index | |
// %4: tile size (2) behind a barrier. %5, %6: outer dims, ceil(dim / tile).
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<?x?x?x?xi32>{%5, %6, %4, %4} : index | |
// NOTE(review): %0 was produced with size %__constant_tensor_4x4xi32__size
// but is typed here as {%3}; likewise %8 is produced as {%7} yet consumed
// below as {%9}. Presumably these sizes are equal at runtime — confirm.
%8 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%1, %2, %5, %6, %4](%0[%c0 to %3 for %3], %1, %2, %5, %6, %4) : (!stream.resource<*>{%3}, index, index, index, index, index) -> !stream.resource<*>{%7} | |
// %9: size of a static 2x2x2x2 i32 tensor.
%9 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2x2x2x2xi32> : index | |
// Move result and reference into external resources, export, and compare.
%10 = stream.async.transfer %8 : !stream.resource<*>{%9} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%9} | |
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x2x2x2xi32> in !stream.resource<external>{%9} -> tensor<2x2x2x2xi32> | |
%12 = stream.async.transfer %__constant_tensor_2x2x2x2xi32 : !stream.resource<constant>{%__constant_tensor_2x2x2x2xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_2x2x2x2xi32__size} | |
%13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2x2x2x2xi32> in !stream.resource<external>{%__constant_tensor_2x2x2x2xi32__size} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%11, %13) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
// Public entry point. Takes no arguments and returns nothing (see the
// iree.abi.declaration string); it only forwards to the private
// @_fully_dynamic_pack_simple function.
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
// Public entry point. Takes no arguments and returns nothing (see the
// iree.abi.declaration string); it only forwards to the private
// @_fully_dynamic_pack_simple function.
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
// Initializer for the 2x2x2x2 i32 constant: creates it as a stream constant
// resource on @__device_0, queries the resource size, and stores both the
// resource and its size into the corresponding globals.
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<2x2x2x2xi32> in !stream.resource<constant> = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_2x2x2x2xi32__size : index | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
// Initializer for the 2x2x2x2 i32 constant: creates it as a stream constant
// resource on @__device_0, queries the resource size, and stores both the
// resource and its size into the corresponding globals.
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<2x2x2x2xi32> in !stream.resource<constant> = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_2x2x2x2xi32__size : index | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
// Initializer for the 4x4 i32 constant (values 0..15): creates it as a stream
// constant resource on @__device_0, queries the resource size, and stores both
// the resource and its size into the corresponding globals.
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<4x4xi32> in !stream.resource<constant> = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_4x4xi32__size : index | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
// Public entry point. Takes no arguments and returns nothing (see the
// iree.abi.declaration string); it only forwards to the private
// @_fully_dynamic_pack_simple function. It performs no global loads or stores.
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
// Test body: dispatches the pack kernel directly on the transferred 4x4
// constant and compares the result against the 2x2x2x2 reference constant.
// No stream.tensor.clone ops appear in this form of the function.
util.func private @_fully_dynamic_pack_simple() { | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
// Load both constants and their sizes from globals.
%__constant_tensor_2x2x2x2xi32 = util.global.load @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_2x2x2x2xi32__size = util.global.load @__constant_tensor_2x2x2x2xi32__size : index | |
%__constant_tensor_4x4xi32 = util.global.load @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32__size = util.global.load @__constant_tensor_4x4xi32__size : index | |
// Source constant moved to a !stream.resource<*>, sized by the global size.
%0 = stream.async.transfer %__constant_tensor_4x4xi32 : !stream.resource<constant>{%__constant_tensor_4x4xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_4x4xi32__size} | |
// %1, %2: source dims (both 4) passed through optimization barriers —
// presumably so they are not constant-folded; confirm against util dialect docs.
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<?x?xi32>{%1, %2} : index | |
// %4: tile size (2) behind a barrier. %5, %6: outer dims, ceil(dim / tile).
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<?x?x?x?xi32>{%5, %6, %4, %4} : index | |
// NOTE(review): %0 was produced with size %__constant_tensor_4x4xi32__size
// but is typed here as {%3}; likewise %8 is produced as {%7} yet consumed
// below as {%9}. Presumably these sizes are equal at runtime — confirm.
%8 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%1, %2, %5, %6, %4](%0[%c0 to %3 for %3], %1, %2, %5, %6, %4) : (!stream.resource<*>{%3}, index, index, index, index, index) -> !stream.resource<*>{%7} | |
// %9: size of a static 2x2x2x2 i32 tensor.
%9 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2x2x2x2xi32> : index | |
// Move result and reference into external resources, export, and compare.
%10 = stream.async.transfer %8 : !stream.resource<*>{%9} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%9} | |
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x2x2x2xi32> in !stream.resource<external>{%9} -> tensor<2x2x2x2xi32> | |
%12 = stream.async.transfer %__constant_tensor_2x2x2x2xi32 : !stream.resource<constant>{%__constant_tensor_2x2x2x2xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_2x2x2x2xi32__size} | |
%13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2x2x2x2xi32> in !stream.resource<external>{%__constant_tensor_2x2x2x2xi32__size} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%11, %13) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_2x2x2x2xi32__size = util.global.load @__constant_tensor_2x2x2x2xi32__size : index | |
%__constant_tensor_4x4xi32 = util.global.load @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32__size = util.global.load @__constant_tensor_4x4xi32__size : index | |
%0 = stream.async.transfer %__constant_tensor_4x4xi32 : !stream.resource<constant>{%__constant_tensor_4x4xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_4x4xi32__size} | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<?x?xi32>{%1, %2} : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<?x?x?x?xi32>{%5, %6, %4, %4} : index | |
%8 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%1, %2, %5, %6, %4](%0[%c0 to %3 for %3], %1, %2, %5, %6, %4) : (!stream.resource<*>{%3}, index, index, index, index, index) -> !stream.resource<*>{%7} | |
%9 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2x2x2x2xi32> : index | |
%10 = stream.async.transfer %8 : !stream.resource<*>{%9} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%9} | |
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x2x2x2xi32> in !stream.resource<external>{%9} -> tensor<2x2x2x2xi32> | |
%12 = stream.async.transfer %__constant_tensor_2x2x2x2xi32 : !stream.resource<constant>{%__constant_tensor_2x2x2x2xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_2x2x2x2xi32__size} | |
%13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2x2x2x2xi32> in !stream.resource<external>{%__constant_tensor_2x2x2x2xi32__size} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%11, %13) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<4x4xi32> in !stream.resource<constant> = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_4x4xi32__size : index | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<4x4xi32> in !stream.resource<constant> = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_4x4xi32__size : index | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%__constant_tensor_2x2x2x2xi32 = util.global.load @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_2x2x2x2xi32__size = util.global.load @__constant_tensor_2x2x2x2xi32__size : index | |
%__constant_tensor_4x4xi32 = util.global.load @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32__size = util.global.load @__constant_tensor_4x4xi32__size : index | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%0 = stream.async.transfer %__constant_tensor_4x4xi32 : !stream.resource<constant>{%__constant_tensor_4x4xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_4x4xi32__size} | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<?x?xi32>{%1, %2} : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<?x?x?x?xi32>{%5, %6, %4, %4} : index | |
%8 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%1, %2, %5, %6, %4](%0[%c0 to %3 for %3], %1, %2, %5, %6, %4) : (!stream.resource<*>{%3}, index, index, index, index, index) -> !stream.resource<*>{%7} | |
%9 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2x2x2x2xi32> : index | |
%10 = stream.async.transfer %8 : !stream.resource<*>{%9} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%9} | |
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x2x2x2xi32> in !stream.resource<external>{%9} -> tensor<2x2x2x2xi32> | |
%12 = stream.async.transfer %__constant_tensor_2x2x2x2xi32 : !stream.resource<constant>{%__constant_tensor_2x2x2x2xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_2x2x2x2xi32__size} | |
%13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2x2x2x2xi32> in !stream.resource<external>{%__constant_tensor_2x2x2x2xi32__size} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%11, %13) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<2x2x2x2xi32> in !stream.resource<constant> = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_2x2x2x2xi32__size : index | |
util.return | |
} | |
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global private @__constant_tensor_2x2x2x2xi32__size : index | |
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<2x2x2x2xi32> in !stream.resource<constant> = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_2x2x2x2xi32__size : index | |
util.return | |
} | |
util.global private @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.global private @__constant_tensor_4x4xi32__size : index | |
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<4x4xi32> in !stream.resource<constant> = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_4x4xi32__size : index | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_2x2x2x2xi32__size = util.global.load @__constant_tensor_2x2x2x2xi32__size : index | |
%__constant_tensor_4x4xi32 = util.global.load @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32__size = util.global.load @__constant_tensor_4x4xi32__size : index | |
%0 = stream.async.transfer %__constant_tensor_4x4xi32 : !stream.resource<constant>{%__constant_tensor_4x4xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_4x4xi32__size} | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<?x?xi32>{%1, %2} : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<?x?x?x?xi32>{%5, %6, %4, %4} : index | |
%8 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%1, %2, %5, %6, %4](%0[%c0 to %3 for %3], %1, %2, %5, %6, %4) : (!stream.resource<*>{%3}, index, index, index, index, index) -> !stream.resource<*>{%7} | |
%9 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2x2x2x2xi32> : index | |
%10 = stream.async.transfer %8 : !stream.resource<*>{%9} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%9} | |
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x2x2x2xi32> in !stream.resource<external>{%9} -> tensor<2x2x2x2xi32> | |
%12 = stream.async.transfer %__constant_tensor_2x2x2x2xi32 : !stream.resource<constant>{%__constant_tensor_2x2x2x2xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_2x2x2x2xi32__size} | |
%13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2x2x2x2xi32> in !stream.resource<external>{%__constant_tensor_2x2x2x2xi32__size} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%11, %13) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global private @__constant_tensor_2x2x2x2xi32__size : index | |
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<2x2x2x2xi32> in !stream.resource<constant> = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_2x2x2x2xi32__size : index | |
util.return | |
} | |
util.global private @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.global private @__constant_tensor_4x4xi32__size : index | |
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<4x4xi32> in !stream.resource<constant> = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_4x4xi32__size : index | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_2x2x2x2xi32__size = util.global.load immutable @__constant_tensor_2x2x2x2xi32__size : index | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32__size = util.global.load immutable @__constant_tensor_4x4xi32__size : index | |
%0 = stream.async.transfer %__constant_tensor_4x4xi32 : !stream.resource<constant>{%__constant_tensor_4x4xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_4x4xi32__size} | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<?x?xi32>{%1, %2} : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<?x?x?x?xi32>{%5, %6, %4, %4} : index | |
%8 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%1, %2, %5, %6, %4](%0[%c0 to %3 for %3], %1, %2, %5, %6, %4) : (!stream.resource<*>{%3}, index, index, index, index, index) -> !stream.resource<*>{%7} | |
%9 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2x2x2x2xi32> : index | |
%10 = stream.async.transfer %8 : !stream.resource<*>{%9} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%9} | |
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x2x2x2xi32> in !stream.resource<external>{%9} -> tensor<2x2x2x2xi32> | |
%12 = stream.async.transfer %__constant_tensor_2x2x2x2xi32 : !stream.resource<constant>{%__constant_tensor_2x2x2x2xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_2x2x2x2xi32__size} | |
%13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2x2x2x2xi32> in !stream.resource<external>{%__constant_tensor_2x2x2x2xi32__size} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%11, %13) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global private @__constant_tensor_2x2x2x2xi32__size : index | |
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<2x2x2x2xi32> in !stream.resource<constant> = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_2x2x2x2xi32__size : index | |
util.return | |
} | |
util.global private @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.global private @__constant_tensor_4x4xi32__size : index | |
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<4x4xi32> in !stream.resource<constant> = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_4x4xi32__size : index | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_2x2x2x2xi32__size = util.global.load immutable @__constant_tensor_2x2x2x2xi32__size : index | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32__size = util.global.load immutable @__constant_tensor_4x4xi32__size : index | |
%0 = stream.async.transfer %__constant_tensor_4x4xi32 : !stream.resource<constant>{%__constant_tensor_4x4xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_4x4xi32__size} | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<?x?xi32>{%1, %2} : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<?x?x?x?xi32>{%5, %6, %4, %4} : index | |
%8 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%1, %2, %5, %6, %4](%0[%c0 to %3 for %3], %1, %2, %5, %6, %4) : (!stream.resource<*>{%3}, index, index, index, index, index) -> !stream.resource<*>{%7} | |
%9 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2x2x2x2xi32> : index | |
%10 = stream.async.transfer %8 : !stream.resource<*>{%9} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%9} | |
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x2x2x2xi32> in !stream.resource<external>{%9} -> tensor<2x2x2x2xi32> | |
%12 = stream.async.transfer %__constant_tensor_2x2x2x2xi32 : !stream.resource<constant>{%__constant_tensor_2x2x2x2xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_2x2x2x2xi32__size} | |
%13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2x2x2x2xi32> in !stream.resource<external>{%__constant_tensor_2x2x2x2xi32__size} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%11, %13) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After IPO (iree-util-ipo) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global private @__constant_tensor_2x2x2x2xi32__size : index | |
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<2x2x2x2xi32> in !stream.resource<constant> = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_2x2x2x2xi32__size : index | |
util.return | |
} | |
util.global private @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.global private @__constant_tensor_4x4xi32__size : index | |
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<4x4xi32> in !stream.resource<constant> = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_4x4xi32__size : index | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_2x2x2x2xi32__size = util.global.load immutable @__constant_tensor_2x2x2x2xi32__size : index | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32__size = util.global.load immutable @__constant_tensor_4x4xi32__size : index | |
%0 = stream.async.transfer %__constant_tensor_4x4xi32 : !stream.resource<constant>{%__constant_tensor_4x4xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_4x4xi32__size} | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<?x?xi32>{%1, %2} : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<?x?x?x?xi32>{%5, %6, %4, %4} : index | |
%8 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%1, %2, %5, %6, %4](%0[%c0 to %3 for %3], %1, %2, %5, %6, %4) : (!stream.resource<*>{%3}, index, index, index, index, index) -> !stream.resource<*>{%7} | |
%9 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2x2x2x2xi32> : index | |
%10 = stream.async.transfer %8 : !stream.resource<*>{%9} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%9} | |
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x2x2x2xi32> in !stream.resource<external>{%9} -> tensor<2x2x2x2xi32> | |
%12 = stream.async.transfer %__constant_tensor_2x2x2x2xi32 : !stream.resource<constant>{%__constant_tensor_2x2x2x2xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_2x2x2x2xi32__size} | |
%13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2x2x2x2xi32> in !stream.resource<external>{%__constant_tensor_2x2x2x2xi32__size} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%11, %13) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After CombineInitializers (iree-util-combine-initializers) //----- // | |
// Whole-module snapshot: target/device attribute aliases, the public entry point,
// the pack dispatch executable, the two constant globals (resource + size) with a
// single initializer that fills all four, and the private function that runs the check.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
// Public entry point: takes and returns nothing, only calls the private function.
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// Executable with one export; the export region turns the five workload operands
// into an (x, y, z) workgroup count via flow.dispatch.workgroup_count_from_slice.
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: %arg0 / %arg6 are the input / output bindings; %arg1..%arg5 are
// tagged as workload ordinals 0..4 and supply every dynamic dimension below.
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
// Input viewed as readonly ?x? with dims {%0, %1}; output as writeonly ?x?x?x?
// with dims {%2, %3, %4, %4}. Both subspans start at offset %c0.
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
// Load the full input, pack it with inner tiles [%4, %4] on dims [0, 1] into an
// empty destination, and store the full packed tensor to the output.
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global private @__constant_tensor_2x2x2x2xi32__size : index | |
// Single initializer for both constants: each is created with stream.tensor.constant,
// its size is queried with stream.resource.size, and resource + size are stored to
// the matching pair of globals.
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<2x2x2x2xi32> in !stream.resource<constant> = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_2x2x2x2xi32__size : index | |
%cst_0 = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<4x4xi32> in !stream.resource<constant> = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
%1 = stream.resource.size %cst_0 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.global.store %1, @__constant_tensor_4x4xi32__size : index | |
util.return | |
} | |
util.global private @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.global private @__constant_tensor_4x4xi32__size : index | |
// Private function: loads both constants, dispatches the pack on the 4x4 one, and
// compares the exported result with the 2x2x2x2 constant.
util.func private @_fully_dynamic_pack_simple() { | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_2x2x2x2xi32__size = util.global.load immutable @__constant_tensor_2x2x2x2xi32__size : index | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32__size = util.global.load immutable @__constant_tensor_4x4xi32__size : index | |
// Re-type the 4x4 constant from !stream.resource<constant> to !stream.resource<*>;
// source and target affinity are the same device.
%0 = stream.async.transfer %__constant_tensor_4x4xi32 : !stream.resource<constant>{%__constant_tensor_4x4xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_4x4xi32__size} | |
// %1, %2 (both from %c4) are the two input dims and %4 (from %c2) is the tile size;
// each goes through util.optimization_barrier before it is used.
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<?x?xi32>{%1, %2} : index | |
%4 = util.optimization_barrier %c2 : index | |
// Outer dims of the packed tensor: ceildiv(dim, tile).
%5 = arith.ceildivui %1, %4 : index | |
%6 = arith.ceildivui %2, %4 : index | |
%7 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<?x?x?x?xi32>{%5, %6, %4, %4} : index | |
// Workload is [%1, %2, %5, %6, %4]; the input is bound over [%c0, %3) and the
// result resource is declared with size %7.
%8 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%1, %2, %5, %6, %4](%0[%c0 to %3 for %3], %1, %2, %5, %6, %4) : (!stream.resource<*>{%3}, index, index, index, index, index) -> !stream.resource<*>{%7} | |
// NOTE(review): %8 was declared with size %7 above but is annotated with %9 (sizeof
// the static 2x2x2x2 type) from here on.
%9 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2x2x2x2xi32> : index | |
%10 = stream.async.transfer %8 : !stream.resource<*>{%9} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%9} | |
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x2x2x2xi32> in !stream.resource<external>{%9} -> tensor<2x2x2x2xi32> | |
// Export the expected constant the same way and compare both tensors.
%12 = stream.async.transfer %__constant_tensor_2x2x2x2xi32 : !stream.resource<constant>{%__constant_tensor_2x2x2x2xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_2x2x2x2xi32__size} | |
%13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2x2x2x2xi32> in !stream.resource<external>{%__constant_tensor_2x2x2x2xi32__size} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%11, %13) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After EncodeHostTensorsPass (iree-stream-encode-host-tensors) //----- // | |
// Public entry point (marked iree.abi.stub): no arguments, no results; its body is a
// single call to the private @_fully_dynamic_pack_simple.
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After EncodeHostTensorsPass (iree-stream-encode-host-tensors) //----- // | |
// Initializer for the two constant globals. Both constants are stream.async.constant
// ops sized by the literal %c64, and that same %c64 is stored into both __size globals.
// NOTE(review): 64 matches 16 i32 elements if the size is a byte count — confirm.
util.initializer { | |
%c64 = arith.constant 64 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
util.global.store %cst, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %c64, @__constant_tensor_2x2x2x2xi32__size : index | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
util.global.store %cst_0, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.global.store %c64, @__constant_tensor_4x4xi32__size : index | |
util.return | |
} | |
// -----// IR Dump After EncodeDeviceTensorsPass (iree-stream-encode-device-tensors) //----- // | |
// Executable containing the single pack dispatch. The export region maps the five
// workload operands to an (x, y, z) workgroup count; the inner module holds the body.
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// %arg0 / %arg6 are the input / output bindings; %arg1..%arg5 are tagged as workload
// ordinals 0..4 and provide all dynamic dimensions used below.
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
// Input: readonly ?x? with dims {%0, %1}. Output: writeonly ?x?x?x? with dims
// {%2, %3, %4, %4}. Both subspans start at offset %c0.
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
// Load everything, pack dims [0, 1] with tiles [%4, %4], store everything.
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
// -----// IR Dump After EncodeHostTensorsPass (iree-stream-encode-host-tensors) //----- // | |
// Private function: loads both constants, dispatches the pack on the 4x4 one, and
// compares the exported result with the 2x2x2x2 constant. Resource sizes are spelled
// out here as arith.muli chains and the literal %c64.
util.func private @_fully_dynamic_pack_simple() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_2x2x2x2xi32__size = util.global.load immutable @__constant_tensor_2x2x2x2xi32__size : index | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32__size = util.global.load immutable @__constant_tensor_4x4xi32__size : index | |
// Re-type the 4x4 constant from !stream.resource<constant> to !stream.resource<*>.
%0 = stream.async.transfer %__constant_tensor_4x4xi32 : !stream.resource<constant>{%__constant_tensor_4x4xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_4x4xi32__size} | |
// %1, %2: input dims (both from %c4, via util.optimization_barrier).
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
// %4 = %1 * 4 * %2: size used for the input operand of the dispatch.
// NOTE(review): the factor %c4 is presumably the i32 element size in bytes — confirm.
%3 = arith.muli %1, %c4 : index | |
%4 = arith.muli %3, %2 : index | |
// %5: tile size (from %c2, via barrier); %6, %7: outer dims = ceildiv(dim, tile).
%5 = util.optimization_barrier %c2 : index | |
%6 = arith.ceildivui %1, %5 : index | |
%7 = arith.ceildivui %2, %5 : index | |
// %11 = %6 * 4 * %7 * %5 * %5: size declared for the dispatch result.
%8 = arith.muli %6, %c4 : index | |
%9 = arith.muli %8, %7 : index | |
%10 = arith.muli %9, %5 : index | |
%11 = arith.muli %10, %5 : index | |
%12 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%1, %2, %6, %7, %5](%0[%c0 to %4 for %4], %1, %2, %6, %7, %5) : (!stream.resource<*>{%4}, index, index, index, index, index) -> !stream.resource<*>{%11} | |
// NOTE(review): %12 is declared with size %11 above but annotated with %c64 here.
%13 = stream.async.transfer %12 : !stream.resource<*>{%c64} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c64} | |
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
// Export the expected constant the same way and compare both tensors.
%15 = stream.async.transfer %__constant_tensor_2x2x2x2xi32 : !stream.resource<constant>{%__constant_tensor_2x2x2x2xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_2x2x2x2xi32__size} | |
%16 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %15 : tensor<2x2x2x2xi32> in !stream.resource<external>{%__constant_tensor_2x2x2x2xi32__size} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%14, %16) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
// Public entry point (iree.abi.stub): no arguments or results; only forwards to the
// private @_fully_dynamic_pack_simple.
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
// Public entry point (iree.abi.stub): a call to the private
// @_fully_dynamic_pack_simple followed by a return; nothing is passed or returned.
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
// Public entry point (iree.abi.stub) with signature () -> (); the body is just the
// call into the private @_fully_dynamic_pack_simple.
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
// Initializer for the constant globals: each constant is created with size %c64 and
// stored together with that size, first the 2x2x2x2 pair, then the 4x4 pair.
util.initializer { | |
%c64 = arith.constant 64 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
util.global.store %cst, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %c64, @__constant_tensor_2x2x2x2xi32__size : index | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
util.global.store %cst_0, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.global.store %c64, @__constant_tensor_4x4xi32__size : index | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
// Initializer for the constant globals: creates the 2x2x2x2 and 4x4 constants (both
// sized by %c64) and stores each resource and its size right after creating it.
util.initializer { | |
%c64 = arith.constant 64 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
util.global.store %cst, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %c64, @__constant_tensor_2x2x2x2xi32__size : index | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
util.global.store %cst_0, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.global.store %c64, @__constant_tensor_4x4xi32__size : index | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
// Initializer for the constant globals. In this form both constants are created
// first and the four global stores (resource + size for each) follow as one group.
util.initializer { | |
%c64 = arith.constant 64 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
util.global.store %cst, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %c64, @__constant_tensor_2x2x2x2xi32__size : index | |
util.global.store %cst_0, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.global.store %c64, @__constant_tensor_4x4xi32__size : index | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
// Private function: loads both constants, dispatches the pack on the 4x4 one, and
// checks the exported result against the 2x2x2x2 constant.
util.func private @_fully_dynamic_pack_simple() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_2x2x2x2xi32__size = util.global.load immutable @__constant_tensor_2x2x2x2xi32__size : index | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32__size = util.global.load immutable @__constant_tensor_4x4xi32__size : index | |
// Re-type the 4x4 constant from !stream.resource<constant> to !stream.resource<*>.
%0 = stream.async.transfer %__constant_tensor_4x4xi32 : !stream.resource<constant>{%__constant_tensor_4x4xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_4x4xi32__size} | |
// Input dims %1, %2 (from %c4) and, further down, tile size %5 (from %c2) are all
// routed through util.optimization_barrier.
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
// Input size: %4 = %1 * 4 * %2.
// NOTE(review): %c4 here is presumably the i32 element size in bytes — confirm.
%3 = arith.muli %1, %c4 : index | |
%4 = arith.muli %3, %2 : index | |
%5 = util.optimization_barrier %c2 : index | |
// Outer dims of the packed tensor: ceildiv(dim, tile).
%6 = arith.ceildivui %1, %5 : index | |
%7 = arith.ceildivui %2, %5 : index | |
// Result size: %11 = %6 * 4 * %7 * %5 * %5.
%8 = arith.muli %6, %c4 : index | |
%9 = arith.muli %8, %7 : index | |
%10 = arith.muli %9, %5 : index | |
%11 = arith.muli %10, %5 : index | |
%12 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%1, %2, %6, %7, %5](%0[%c0 to %4 for %4], %1, %2, %6, %7, %5) : (!stream.resource<*>{%4}, index, index, index, index, index) -> !stream.resource<*>{%11} | |
// NOTE(review): %12 is declared with size %11 above but annotated with %c64 here.
%13 = stream.async.transfer %12 : !stream.resource<*>{%c64} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c64} | |
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
// Expected value: the 2x2x2x2 constant, transferred to an external resource and exported.
%15 = stream.async.transfer %__constant_tensor_2x2x2x2xi32 : !stream.resource<constant>{%__constant_tensor_2x2x2x2xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_2x2x2x2xi32__size} | |
%16 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %15 : tensor<2x2x2x2xi32> in !stream.resource<external>{%__constant_tensor_2x2x2x2xi32__size} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%14, %16) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
// Private function: constants -> pack dispatch -> export -> equality check against
// the expected 2x2x2x2 constant.
util.func private @_fully_dynamic_pack_simple() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_2x2x2x2xi32__size = util.global.load immutable @__constant_tensor_2x2x2x2xi32__size : index | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32__size = util.global.load immutable @__constant_tensor_4x4xi32__size : index | |
// The 4x4 constant becomes the dispatch input after a constant -> `*` transfer.
%0 = stream.async.transfer %__constant_tensor_4x4xi32 : !stream.resource<constant>{%__constant_tensor_4x4xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_4x4xi32__size} | |
// %1 and %2 are two separate util.optimization_barrier results with the same operand
// %c4; they are used as the two input dims.
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
// Input size: %4 = %1 * 4 * %2.
// NOTE(review): %c4 here is presumably the i32 element size in bytes — confirm.
%3 = arith.muli %1, %c4 : index | |
%4 = arith.muli %3, %2 : index | |
// Tile size (from %c2, via barrier) and the resulting outer dims.
%5 = util.optimization_barrier %c2 : index | |
%6 = arith.ceildivui %1, %5 : index | |
%7 = arith.ceildivui %2, %5 : index | |
// Result size: %11 = %6 * 4 * %7 * %5 * %5.
%8 = arith.muli %6, %c4 : index | |
%9 = arith.muli %8, %7 : index | |
%10 = arith.muli %9, %5 : index | |
%11 = arith.muli %10, %5 : index | |
%12 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%1, %2, %6, %7, %5](%0[%c0 to %4 for %4], %1, %2, %6, %7, %5) : (!stream.resource<*>{%4}, index, index, index, index, index) -> !stream.resource<*>{%11} | |
// NOTE(review): %12 is declared with size %11 above but annotated with %c64 here.
%13 = stream.async.transfer %12 : !stream.resource<*>{%c64} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c64} | |
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
// Expected value, exported the same way, then compared.
%15 = stream.async.transfer %__constant_tensor_2x2x2x2xi32 : !stream.resource<constant>{%__constant_tensor_2x2x2x2xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_2x2x2x2xi32__size} | |
%16 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %15 : tensor<2x2x2x2xi32> in !stream.resource<external>{%__constant_tensor_2x2x2x2xi32__size} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%14, %16) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
// Private function: constants -> pack dispatch -> export -> equality check. In this
// form the four util.global.load ops come first, ahead of the arith constants.
util.func private @_fully_dynamic_pack_simple() { | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_2x2x2x2xi32__size = util.global.load immutable @__constant_tensor_2x2x2x2xi32__size : index | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32__size = util.global.load immutable @__constant_tensor_4x4xi32__size : index | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
// The 4x4 constant becomes the dispatch input after a constant -> `*` transfer.
%0 = stream.async.transfer %__constant_tensor_4x4xi32 : !stream.resource<constant>{%__constant_tensor_4x4xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_4x4xi32__size} | |
// Input dims (from %c4) pass through util.optimization_barrier.
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
// Input size: %4 = %1 * 4 * %2.
// NOTE(review): %c4 here is presumably the i32 element size in bytes — confirm.
%3 = arith.muli %1, %c4 : index | |
%4 = arith.muli %3, %2 : index | |
// Tile size (from %c2, via barrier) and outer dims = ceildiv(dim, tile).
%5 = util.optimization_barrier %c2 : index | |
%6 = arith.ceildivui %1, %5 : index | |
%7 = arith.ceildivui %2, %5 : index | |
// Result size: %11 = %6 * 4 * %7 * %5 * %5.
%8 = arith.muli %6, %c4 : index | |
%9 = arith.muli %8, %7 : index | |
%10 = arith.muli %9, %5 : index | |
%11 = arith.muli %10, %5 : index | |
%12 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%1, %2, %6, %7, %5](%0[%c0 to %4 for %4], %1, %2, %6, %7, %5) : (!stream.resource<*>{%4}, index, index, index, index, index) -> !stream.resource<*>{%11} | |
// NOTE(review): %12 is declared with size %11 above but annotated with %c64 here.
%13 = stream.async.transfer %12 : !stream.resource<*>{%c64} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c64} | |
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
// Expected value, exported the same way, then compared.
%15 = stream.async.transfer %__constant_tensor_2x2x2x2xi32 : !stream.resource<constant>{%__constant_tensor_2x2x2x2xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_2x2x2x2xi32__size} | |
%16 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %15 : tensor<2x2x2x2xi32> in !stream.resource<external>{%__constant_tensor_2x2x2x2xi32__size} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%14, %16) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // | |
// Whole-module snapshot: target/device attribute aliases, the public entry point,
// the pack dispatch executable, the constant globals with their initializer, and the
// private function. Here both __size globals carry the initial value 64 and the
// initializer stores only the two constant resources.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
// Public entry point: takes and returns nothing, only calls the private function.
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// Executable with one export; the export region turns the five workload operands
// into an (x, y, z) workgroup count.
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: %arg0 / %arg6 are the input / output bindings; %arg1..%arg5 are
// tagged as workload ordinals 0..4 and supply every dynamic dimension below.
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
// Input viewed as readonly ?x? with dims {%0, %1}; output as writeonly ?x?x?x?
// with dims {%2, %3, %4, %4}. Both subspans start at offset %c0.
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
// Load the full input, pack it with inner tiles [%4, %4] on dims [0, 1] into an
// empty destination, and store the full packed tensor to the output.
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global private @__constant_tensor_2x2x2x2xi32__size = 64 : index | |
// Initializer: creates both constants (sized by %c64) and stores only the two
// resources; no store to either __size global appears here.
util.initializer { | |
%c64 = arith.constant 64 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
util.global.store %cst, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.global private @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.global private @__constant_tensor_4x4xi32__size = 64 : index | |
// Private function: loads both constants (and, still, both __size globals),
// dispatches the pack on the 4x4 one, and compares the exported result with the
// 2x2x2x2 constant.
util.func private @_fully_dynamic_pack_simple() { | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_2x2x2x2xi32__size = util.global.load immutable @__constant_tensor_2x2x2x2xi32__size : index | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32__size = util.global.load immutable @__constant_tensor_4x4xi32__size : index | |
// Re-type the 4x4 constant from !stream.resource<constant> to !stream.resource<*>.
%0 = stream.async.transfer %__constant_tensor_4x4xi32 : !stream.resource<constant>{%__constant_tensor_4x4xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_4x4xi32__size} | |
// Input dims (from %c4) pass through util.optimization_barrier.
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
// Input size: %4 = %1 * 4 * %2.
// NOTE(review): %c4 here is presumably the i32 element size in bytes — confirm.
%3 = arith.muli %1, %c4 : index | |
%4 = arith.muli %3, %2 : index | |
// Tile size (from %c2, via barrier) and outer dims = ceildiv(dim, tile).
%5 = util.optimization_barrier %c2 : index | |
%6 = arith.ceildivui %1, %5 : index | |
%7 = arith.ceildivui %2, %5 : index | |
// Result size: %11 = %6 * 4 * %7 * %5 * %5.
%8 = arith.muli %6, %c4 : index | |
%9 = arith.muli %8, %7 : index | |
%10 = arith.muli %9, %5 : index | |
%11 = arith.muli %10, %5 : index | |
%12 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%1, %2, %6, %7, %5](%0[%c0 to %4 for %4], %1, %2, %6, %7, %5) : (!stream.resource<*>{%4}, index, index, index, index, index) -> !stream.resource<*>{%11} | |
// NOTE(review): %12 is declared with size %11 above but annotated with %c64 here.
%13 = stream.async.transfer %12 : !stream.resource<*>{%c64} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c64} | |
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
// Export the expected constant the same way and compare both tensors.
%15 = stream.async.transfer %__constant_tensor_2x2x2x2xi32 : !stream.resource<constant>{%__constant_tensor_2x2x2x2xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_2x2x2x2xi32__size} | |
%16 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %15 : tensor<2x2x2x2xi32> in !stream.resource<external>{%__constant_tensor_2x2x2x2xi32__size} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%14, %16) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.initializer { | |
%c64 = arith.constant 64 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
util.global.store %cst, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.global private @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.func private @_fully_dynamic_pack_simple() { | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%0 = stream.async.transfer %__constant_tensor_4x4xi32 : !stream.resource<constant>{%c64} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c64} | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = arith.muli %1, %c4 : index | |
%4 = arith.muli %3, %2 : index | |
%5 = util.optimization_barrier %c2 : index | |
%6 = arith.ceildivui %1, %5 : index | |
%7 = arith.ceildivui %2, %5 : index | |
%8 = arith.muli %6, %c4 : index | |
%9 = arith.muli %8, %7 : index | |
%10 = arith.muli %9, %5 : index | |
%11 = arith.muli %10, %5 : index | |
%12 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%1, %2, %6, %7, %5](%0[%c0 to %4 for %4], %1, %2, %6, %7, %5) : (!stream.resource<*>{%4}, index, index, index, index, index) -> !stream.resource<*>{%11} | |
%13 = stream.async.transfer %12 : !stream.resource<*>{%c64} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c64} | |
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%15 = stream.async.transfer %__constant_tensor_2x2x2x2xi32 : !stream.resource<constant>{%c64} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c64} | |
%16 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %15 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%14, %16) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.initializer { | |
%c64 = arith.constant 64 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
util.global.store %cst, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.global private @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.func private @_fully_dynamic_pack_simple() { | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%0 = stream.async.transfer %__constant_tensor_4x4xi32 : !stream.resource<constant>{%c64} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c64} | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = arith.muli %1, %c4 : index | |
%4 = arith.muli %3, %2 : index | |
%5 = util.optimization_barrier %c2 : index | |
%6 = arith.ceildivui %1, %5 : index | |
%7 = arith.ceildivui %2, %5 : index | |
%8 = arith.muli %6, %c4 : index | |
%9 = arith.muli %8, %7 : index | |
%10 = arith.muli %9, %5 : index | |
%11 = arith.muli %10, %5 : index | |
%12 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%1, %2, %6, %7, %5](%0[%c0 to %4 for %4], %1, %2, %6, %7, %5) : (!stream.resource<*>{%4}, index, index, index, index, index) -> !stream.resource<*>{%11} | |
%13 = stream.async.transfer %12 : !stream.resource<*>{%c64} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c64} | |
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%15 = stream.async.transfer %__constant_tensor_2x2x2x2xi32 : !stream.resource<constant>{%c64} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c64} | |
%16 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %15 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%14, %16) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After IPO (iree-util-ipo) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.initializer { | |
%c64 = arith.constant 64 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
util.global.store %cst, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.global private @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.func private @_fully_dynamic_pack_simple() { | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%0 = stream.async.transfer %__constant_tensor_4x4xi32 : !stream.resource<constant>{%c64} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c64} | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = arith.muli %1, %c4 : index | |
%4 = arith.muli %3, %2 : index | |
%5 = util.optimization_barrier %c2 : index | |
%6 = arith.ceildivui %1, %5 : index | |
%7 = arith.ceildivui %2, %5 : index | |
%8 = arith.muli %6, %c4 : index | |
%9 = arith.muli %8, %7 : index | |
%10 = arith.muli %9, %5 : index | |
%11 = arith.muli %10, %5 : index | |
%12 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%1, %2, %6, %7, %5](%0[%c0 to %4 for %4], %1, %2, %6, %7, %5) : (!stream.resource<*>{%4}, index, index, index, index, index) -> !stream.resource<*>{%11} | |
%13 = stream.async.transfer %12 : !stream.resource<*>{%c64} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c64} | |
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%15 = stream.async.transfer %__constant_tensor_2x2x2x2xi32 : !stream.resource<constant>{%c64} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c64} | |
%16 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %15 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%14, %16) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After VerifyLoweringToAsyncResourcesPass (iree-stream-verify-lowering-to-async-resources) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.initializer { | |
%c64 = arith.constant 64 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
util.global.store %cst, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.global private @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.func private @_fully_dynamic_pack_simple() { | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%0 = stream.async.transfer %__constant_tensor_4x4xi32 : !stream.resource<constant>{%c64} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c64} | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = arith.muli %1, %c4 : index | |
%4 = arith.muli %3, %2 : index | |
%5 = util.optimization_barrier %c2 : index | |
%6 = arith.ceildivui %1, %5 : index | |
%7 = arith.ceildivui %2, %5 : index | |
%8 = arith.muli %6, %c4 : index | |
%9 = arith.muli %8, %7 : index | |
%10 = arith.muli %9, %5 : index | |
%11 = arith.muli %10, %5 : index | |
%12 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%1, %2, %6, %7, %5](%0[%c0 to %4 for %4], %1, %2, %6, %7, %5) : (!stream.resource<*>{%4}, index, index, index, index, index) -> !stream.resource<*>{%11} | |
%13 = stream.async.transfer %12 : !stream.resource<*>{%c64} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c64} | |
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%15 = stream.async.transfer %__constant_tensor_2x2x2x2xi32 : !stream.resource<constant>{%c64} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c64} | |
%16 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %15 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%14, %16) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After MaterializeCopyOnWritePass (iree-stream-materialize-copy-on-write) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After MaterializeCopyOnWritePass (iree-stream-materialize-copy-on-write) //----- // | |
util.initializer { | |
%c64 = arith.constant 64 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
util.global.store %cst, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After MaterializeCopyOnWritePass (iree-stream-materialize-copy-on-write) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%0 = stream.async.transfer %__constant_tensor_4x4xi32 : !stream.resource<constant>{%c64} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c64} | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = arith.muli %1, %c4 : index | |
%4 = arith.muli %3, %2 : index | |
%5 = util.optimization_barrier %c2 : index | |
%6 = arith.ceildivui %1, %5 : index | |
%7 = arith.ceildivui %2, %5 : index | |
%8 = arith.muli %6, %c4 : index | |
%9 = arith.muli %8, %7 : index | |
%10 = arith.muli %9, %5 : index | |
%11 = arith.muli %10, %5 : index | |
%12 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%1, %2, %6, %7, %5](%0[%c0 to %4 for %4], %1, %2, %6, %7, %5) : (!stream.resource<*>{%4}, index, index, index, index, index) -> !stream.resource<*>{%11} | |
%13 = stream.async.transfer %12 : !stream.resource<*>{%c64} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c64} | |
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%15 = stream.async.transfer %__constant_tensor_2x2x2x2xi32 : !stream.resource<constant>{%c64} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c64} | |
%16 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %15 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%14, %16) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%0 = stream.async.transfer %__constant_tensor_4x4xi32 : !stream.resource<constant>{%c64} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c64} | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = arith.muli %1, %c4 : index | |
%4 = arith.muli %3, %2 : index | |
%5 = util.optimization_barrier %c2 : index | |
%6 = arith.ceildivui %1, %5 : index | |
%7 = arith.ceildivui %2, %5 : index | |
%8 = arith.muli %6, %c4 : index | |
%9 = arith.muli %8, %7 : index | |
%10 = arith.muli %9, %5 : index | |
%11 = arith.muli %10, %5 : index | |
%12 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%1, %2, %6, %7, %5](%0[%c0 to %4 for %4], %1, %2, %6, %7, %5) : (!stream.resource<*>{%4}, index, index, index, index, index) -> !stream.resource<*>{%11} | |
%13 = stream.async.transfer %12 : !stream.resource<*>{%c64} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c64} | |
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%15 = stream.async.transfer %__constant_tensor_2x2x2x2xi32 : !stream.resource<constant>{%c64} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c64} | |
%16 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %15 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%14, %16) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.initializer { | |
%c64 = arith.constant 64 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
util.global.store %cst, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After ElideAsyncCopiesPass (iree-stream-elide-async-copies) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.initializer { | |
%c64 = arith.constant 64 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
util.global.store %cst, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.global private @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.func private @_fully_dynamic_pack_simple() { | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%0 = stream.async.transfer %__constant_tensor_4x4xi32 : !stream.resource<constant>{%c64} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c64} | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = arith.muli %1, %c4 : index | |
%4 = arith.muli %3, %2 : index | |
%5 = util.optimization_barrier %c2 : index | |
%6 = arith.ceildivui %1, %5 : index | |
%7 = arith.ceildivui %2, %5 : index | |
%8 = arith.muli %6, %c4 : index | |
%9 = arith.muli %8, %7 : index | |
%10 = arith.muli %9, %5 : index | |
%11 = arith.muli %10, %5 : index | |
%12 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%1, %2, %6, %7, %5](%0[%c0 to %4 for %4], %1, %2, %6, %7, %5) : (!stream.resource<*>{%4}, index, index, index, index, index) -> !stream.resource<*>{%11} | |
%13 = stream.async.transfer %12 : !stream.resource<*>{%c64} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c64} | |
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%15 = stream.async.transfer %__constant_tensor_2x2x2x2xi32 : !stream.resource<constant>{%c64} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c64} | |
%16 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %15 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%14, %16) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.initializer { | |
%c64 = arith.constant 64 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
util.global.store %cst, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After EmplaceAllocationsPass (iree-stream-emplace-allocations) //----- // | |
util.initializer { | |
%c64 = arith.constant 64 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
util.global.store %cst, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%0 = stream.async.transfer %__constant_tensor_4x4xi32 : !stream.resource<constant>{%c64} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c64} | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = arith.muli %1, %c4 : index | |
%4 = arith.muli %3, %2 : index | |
%5 = util.optimization_barrier %c2 : index | |
%6 = arith.ceildivui %1, %5 : index | |
%7 = arith.ceildivui %2, %5 : index | |
%8 = arith.muli %6, %c4 : index | |
%9 = arith.muli %8, %7 : index | |
%10 = arith.muli %9, %5 : index | |
%11 = arith.muli %10, %5 : index | |
%12 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%1, %2, %6, %7, %5](%0[%c0 to %4 for %4], %1, %2, %6, %7, %5) : (!stream.resource<*>{%4}, index, index, index, index, index) -> !stream.resource<*>{%11} | |
%13 = stream.async.transfer %12 : !stream.resource<*>{%c64} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c64} | |
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%15 = stream.async.transfer %__constant_tensor_2x2x2x2xi32 : !stream.resource<constant>{%c64} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c64} | |
%16 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %15 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%14, %16) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After EmplaceAllocationsPass (iree-stream-emplace-allocations) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%0 = stream.async.transfer %__constant_tensor_4x4xi32 : !stream.resource<constant>{%c64} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c64} | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c4 : index | |
%3 = arith.muli %1, %c4 : index | |
%4 = arith.muli %3, %2 : index | |
%5 = util.optimization_barrier %c2 : index | |
%6 = arith.ceildivui %1, %5 : index | |
%7 = arith.ceildivui %2, %5 : index | |
%8 = arith.muli %6, %c4 : index | |
%9 = arith.muli %8, %7 : index | |
%10 = arith.muli %9, %5 : index | |
%11 = arith.muli %10, %5 : index | |
%12 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%1, %2, %6, %7, %5](%0[%c0 to %4 for %4], %1, %2, %6, %7, %5) : (!stream.resource<*>{%4}, index, index, index, index, index) -> !stream.resource<*>{%11} | |
%13 = stream.async.transfer %12 : !stream.resource<*>{%c64} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c64} | |
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%15 = stream.async.transfer %__constant_tensor_2x2x2x2xi32 : !stream.resource<constant>{%c64} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c64} | |
%16 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %15 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%14, %16) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After EmplaceAllocationsPass (iree-stream-emplace-allocations) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After RefineUsagePass (iree-stream-refine-usage) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.initializer { | |
%c64 = arith.constant 64 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
util.global.store %cst, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.global private @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.func private @_fully_dynamic_pack_simple() { | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%__constant_tensor_4x4xi32[%c0 to %3 for %3], %0, %1, %5, %6, %4) : (!stream.resource<constant>{%3}, index, index, index, index, index) -> !stream.resource<external>{%10} | |
%12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%13 = stream.async.transfer %__constant_tensor_2x2x2x2xi32 : !stream.resource<constant>{%c64} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c64} | |
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%12, %14) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.initializer { | |
%c64 = arith.constant 64 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
util.global.store %cst, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.initializer { | |
%c64 = arith.constant 64 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
util.global.store %cst, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
util.initializer { | |
%c64 = arith.constant 64 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
util.global.store %cst, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%__constant_tensor_4x4xi32[%c0 to %3 for %3], %0, %1, %5, %6, %4) : (!stream.resource<constant>{%3}, index, index, index, index, index) -> !stream.resource<external>{%10} | |
%12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%13 = stream.async.transfer %__constant_tensor_2x2x2x2xi32 : !stream.resource<constant>{%c64} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c64} | |
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%12, %14) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%__constant_tensor_4x4xi32[%c0 to %3 for %3], %0, %1, %5, %6, %4) : (!stream.resource<constant>{%3}, index, index, index, index, index) -> !stream.resource<external>{%10} | |
%12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%13 = stream.async.transfer %__constant_tensor_2x2x2x2xi32 : !stream.resource<constant>{%c64} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c64} | |
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%12, %14) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%__constant_tensor_4x4xi32[%c0 to %3 for %3], %0, %1, %5, %6, %4) : (!stream.resource<constant>{%3}, index, index, index, index, index) -> !stream.resource<external>{%10} | |
%12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%13 = stream.async.transfer %__constant_tensor_2x2x2x2xi32 : !stream.resource<constant>{%c64} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c64} | |
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%12, %14) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.initializer { | |
%c64 = arith.constant 64 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
util.global.store %cst, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.global private @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.func private @_fully_dynamic_pack_simple() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%__constant_tensor_4x4xi32[%c0 to %3 for %3], %0, %1, %5, %6, %4) : (!stream.resource<constant>{%3}, index, index, index, index, index) -> !stream.resource<external>{%10} | |
%12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%13 = stream.async.transfer %__constant_tensor_2x2x2x2xi32 : !stream.resource<constant>{%c64} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c64} | |
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%12, %14) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.initializer { | |
%c64 = arith.constant 64 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
util.global.store %cst, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.global private @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.func private @_fully_dynamic_pack_simple() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%__constant_tensor_4x4xi32[%c0 to %3 for %3], %0, %1, %5, %6, %4) : (!stream.resource<constant>{%3}, index, index, index, index, index) -> !stream.resource<external>{%10} | |
%12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%13 = stream.async.transfer %__constant_tensor_2x2x2x2xi32 : !stream.resource<constant>{%c64} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c64} | |
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%12, %14) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
// Public entry point marked iree.abi.stub: takes no arguments, returns nothing, and only
// forwards to the private @_fully_dynamic_pack_simple.
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// Dispatch executable whose body is a single tensor.pack over i32 data.
// The export region derives the workgroup counts (%x, %y, %z) from its five index arguments
// via flow.dispatch.workgroup_count_from_slice.
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// %arg0 and %arg6 are the two bindings; %arg1..%arg5 are index operands tagged below as
// workload ordinals 0..4.
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
// %arg0 is viewed as a readonly ?x? i32 tensor with dynamic dims (%0, %1).
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
// %arg6 is viewed as a writeonly 4-D i32 tensor with dims (%2, %3, %4, %4); %4 is reused as
// the tile size for both inner dims of the pack below.
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
// Load the whole input, pack dims 0 and 1 with %4 x %4 inner tiles, then store the whole result.
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
// Initializer that materializes both test constants as !stream.resource<constant> values of
// size %c64 and stores them into their globals.
// %cst holds the 2x2x2x2 literal, %cst_0 the 4x4 literal (16 i32 elements each).
util.initializer { | |
%c64 = arith.constant 64 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
util.global.store %cst, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.global private @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
// Test body: dispatches the pack executable on the 4x4 constant and compares the result with
// the 2x2x2x2 constant via check.expect_eq.
util.func private @_fully_dynamic_pack_simple() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
// The values 4, 4 and 2 are routed through util.optimization_barrier before being used in
// the size arithmetic and as dispatch operands.
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
// %3 = %0 * 4 * %1: the size attached to the input resource in the dispatch.
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
// %5, %6 = ceil(%0 / %4), ceil(%1 / %4); both are passed to the dispatch as workload operands.
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
// %10 = %5 * 4 * %6 * %4 * %4: the size attached to the dispatch result.
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%__constant_tensor_4x4xi32[%c0 to %3 for %3], %0, %1, %5, %6, %4) : (!stream.resource<constant>{%3}, index, index, index, index, index) -> !stream.resource<external>{%10} | |
// The dispatch result is exported as a static tensor<2x2x2x2xi32> annotated with size %c64.
%12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
// The 2x2x2x2 constant is transferred to an external resource and exported the same way.
%13 = stream.async.transfer %__constant_tensor_2x2x2x2xi32 : !stream.resource<constant>{%c64} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c64} | |
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%12, %14) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After IPO (iree-util-ipo) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
// Public entry point marked iree.abi.stub: takes no arguments, returns nothing, and only
// forwards to the private @_fully_dynamic_pack_simple.
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// Dispatch executable whose body is a single tensor.pack over i32 data.
// The export region derives the workgroup counts (%x, %y, %z) from its five index arguments
// via flow.dispatch.workgroup_count_from_slice.
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// %arg0 and %arg6 are the two bindings; %arg1..%arg5 are index operands tagged below as
// workload ordinals 0..4.
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
// %arg0 is viewed as a readonly ?x? i32 tensor with dynamic dims (%0, %1).
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
// %arg6 is viewed as a writeonly 4-D i32 tensor with dims (%2, %3, %4, %4); %4 is reused as
// the tile size for both inner dims of the pack below.
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
// Load the whole input, pack dims 0 and 1 with %4 x %4 inner tiles, then store the whole result.
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
// Initializer that materializes both test constants as !stream.resource<constant> values of
// size %c64 and stores them into their globals.
// %cst holds the 2x2x2x2 literal, %cst_0 the 4x4 literal (16 i32 elements each).
util.initializer { | |
%c64 = arith.constant 64 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
util.global.store %cst, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.global private @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
// Test body: dispatches the pack executable on the 4x4 constant and compares the result with
// the 2x2x2x2 constant via check.expect_eq.
util.func private @_fully_dynamic_pack_simple() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
// The values 4, 4 and 2 are routed through util.optimization_barrier before being used in
// the size arithmetic and as dispatch operands.
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
// %3 = %0 * 4 * %1: the size attached to the input resource in the dispatch.
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
// %5, %6 = ceil(%0 / %4), ceil(%1 / %4); both are passed to the dispatch as workload operands.
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
// %10 = %5 * 4 * %6 * %4 * %4: the size attached to the dispatch result.
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%__constant_tensor_4x4xi32[%c0 to %3 for %3], %0, %1, %5, %6, %4) : (!stream.resource<constant>{%3}, index, index, index, index, index) -> !stream.resource<external>{%10} | |
// The dispatch result is exported as a static tensor<2x2x2x2xi32> annotated with size %c64.
%12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
// The 2x2x2x2 constant is transferred to an external resource and exported the same way.
%13 = stream.async.transfer %__constant_tensor_2x2x2x2xi32 : !stream.resource<constant>{%c64} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c64} | |
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%12, %14) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After VerifyAsyncAccessRangesPass (iree-stream-verify-async-access-ranges) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
// Public entry point marked iree.abi.stub: takes no arguments, returns nothing, and only
// forwards to the private @_fully_dynamic_pack_simple.
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// Dispatch executable whose body is a single tensor.pack over i32 data.
// The export region derives the workgroup counts (%x, %y, %z) from its five index arguments
// via flow.dispatch.workgroup_count_from_slice.
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// %arg0 and %arg6 are the two bindings; %arg1..%arg5 are index operands tagged below as
// workload ordinals 0..4.
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
// %arg0 is viewed as a readonly ?x? i32 tensor with dynamic dims (%0, %1).
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
// %arg6 is viewed as a writeonly 4-D i32 tensor with dims (%2, %3, %4, %4); %4 is reused as
// the tile size for both inner dims of the pack below.
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
// Load the whole input, pack dims 0 and 1 with %4 x %4 inner tiles, then store the whole result.
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
// Initializer that materializes both test constants as !stream.resource<constant> values of
// size %c64 and stores them into their globals.
// %cst holds the 2x2x2x2 literal, %cst_0 the 4x4 literal (16 i32 elements each).
util.initializer { | |
%c64 = arith.constant 64 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
util.global.store %cst, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.global private @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
// Test body: dispatches the pack executable on the 4x4 constant and compares the result with
// the 2x2x2x2 constant via check.expect_eq.
util.func private @_fully_dynamic_pack_simple() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
// The values 4, 4 and 2 are routed through util.optimization_barrier before being used in
// the size arithmetic and as dispatch operands.
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
// %3 = %0 * 4 * %1: the size attached to the input resource in the dispatch.
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
// %5, %6 = ceil(%0 / %4), ceil(%1 / %4); both are passed to the dispatch as workload operands.
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
// %10 = %5 * 4 * %6 * %4 * %4: the size attached to the dispatch result.
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%__constant_tensor_4x4xi32[%c0 to %3 for %3], %0, %1, %5, %6, %4) : (!stream.resource<constant>{%3}, index, index, index, index, index) -> !stream.resource<external>{%10} | |
// The dispatch result is exported as a static tensor<2x2x2x2xi32> annotated with size %c64.
%12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
// The 2x2x2x2 constant is transferred to an external resource and exported the same way.
%13 = stream.async.transfer %__constant_tensor_2x2x2x2xi32 : !stream.resource<constant>{%c64} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c64} | |
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%12, %14) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After ScheduleExecutionPass (iree-stream-schedule-execution) //----- // | |
// Public entry point marked iree.abi.stub: takes no arguments, returns nothing, and only
// forwards to the private @_fully_dynamic_pack_simple.
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After ScheduleConcurrencyPass (iree-stream-schedule-concurrency) //----- // | |
// Public entry point marked iree.abi.stub: takes no arguments, returns nothing, and only
// forwards to the private @_fully_dynamic_pack_simple.
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After ScheduleExecutionPass (iree-stream-schedule-execution) //----- // | |
// Initializer after execution scheduling: both constants are produced inside a single
// stream.async.execute region that also yields %result_timepoint.
util.initializer { | |
%c64 = arith.constant 64 : index | |
%results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> (!stream.resource<constant>{%c64}, !stream.resource<constant>{%c64}) { | |
%cst = stream.async.constant : !stream.resource<constant>{%c64} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%cst_0 = stream.async.constant : !stream.resource<constant>{%c64} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
stream.yield %cst, %cst_0 : !stream.resource<constant>{%c64}, !stream.resource<constant>{%c64} | |
} => !stream.timepoint | |
// Both results are awaited on %result_timepoint, and the awaited values are what get stored
// into the globals.
%0:2 = stream.timepoint.await %result_timepoint => %results#0, %results#1 : !stream.resource<constant>{%c64}, !stream.resource<constant>{%c64} | |
util.global.store %0#0, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %0#1, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After ScheduleExecutionPass (iree-stream-schedule-execution) //----- // | |
// Test body after execution scheduling: the pack dispatch and the constant transfer now sit
// inside one stream.async.execute region that captures both constant resources and also
// yields %result_timepoint.
util.func private @_fully_dynamic_pack_simple() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
// The values 4, 4 and 2 are routed through util.optimization_barrier before being used in
// the size arithmetic and as dispatch operands.
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
// %3 = %0 * 4 * %1: the size attached to the captured 4x4 constant resource.
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
// %10 = %5 * 4 * %6 * %4 * %4: the size attached to the dispatch result.
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
// Region results: #0 is the dispatch output (size %10), #1 is the transferred constant (size %c64).
%results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%__constant_tensor_4x4xi32 as %arg0: !stream.resource<constant>{%3}, %__constant_tensor_2x2x2x2xi32 as %arg1: !stream.resource<constant>{%c64}) -> (!stream.resource<external>{%10}, !stream.resource<external>{%c64}) { | |
%14 = stream.async.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%arg0[%c0 to %3 for %3], %0, %1, %5, %6, %4) : (!stream.resource<constant>{%3}, index, index, index, index, index) -> !stream.resource<external>{%10} | |
%15 = stream.async.transfer %arg1 : !stream.resource<constant>{%c64} from(#hal.device.affinity<@__device_0>) -> !stream.resource<external>{%c64} | |
stream.yield %14, %15 : !stream.resource<external>{%10}, !stream.resource<external>{%c64} | |
} => !stream.timepoint | |
// The await lists the results in swapped order: %11#0 is %results#1 (the transferred
// constant) and %11#1 is %results#0 (the dispatch output).
%11:2 = stream.timepoint.await %result_timepoint => %results#1, %results#0 : !stream.resource<external>{%c64}, !stream.resource<external>{%10} | |
%12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11#0 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11#1 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
// %13 (dispatch output) is the first operand, %12 (transferred constant) the second.
check.expect_eq(%13, %12) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After ScheduleConcurrencyPass (iree-stream-schedule-concurrency) //----- // | |
// Initializer as dumped after the concurrency-scheduling pass: both constants are produced
// inside a single stream.async.execute region that also yields %result_timepoint.
util.initializer { | |
%c64 = arith.constant 64 : index | |
%results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> (!stream.resource<constant>{%c64}, !stream.resource<constant>{%c64}) { | |
%cst = stream.async.constant : !stream.resource<constant>{%c64} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%cst_0 = stream.async.constant : !stream.resource<constant>{%c64} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
stream.yield %cst, %cst_0 : !stream.resource<constant>{%c64}, !stream.resource<constant>{%c64} | |
} => !stream.timepoint | |
// Both results are awaited on %result_timepoint, and the awaited values are what get stored
// into the globals.
%0:2 = stream.timepoint.await %result_timepoint => %results#0, %results#1 : !stream.resource<constant>{%c64}, !stream.resource<constant>{%c64} | |
util.global.store %0#0, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %0#1, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After ScheduleConcurrencyPass (iree-stream-schedule-concurrency) //----- // | |
// Test body after concurrency scheduling: inside the stream.async.execute region, the pack
// dispatch and the constant transfer are grouped in a nested stream.async.concurrent region.
util.func private @_fully_dynamic_pack_simple() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
// The values 4, 4 and 2 are routed through util.optimization_barrier before being used in
// the size arithmetic and as dispatch operands.
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
// %3 = %0 * 4 * %1: the size attached to the captured 4x4 constant resource.
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
// %10 = %5 * 4 * %6 * %4 * %4: the size attached to the dispatch result.
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
// Region results: #0 is the dispatch output (size %10), #1 is the transferred constant (size %c64).
%results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%__constant_tensor_4x4xi32 as %arg0: !stream.resource<constant>{%3}, %__constant_tensor_2x2x2x2xi32 as %arg1: !stream.resource<constant>{%c64}) -> (!stream.resource<external>{%10}, !stream.resource<external>{%c64}) { | |
// The concurrent region re-captures the execute region's arguments as %arg2 / %arg3 and
// yields the dispatch output and the transferred constant, which the outer region forwards.
%14:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<constant>{%3}, %arg1 as %arg3: !stream.resource<constant>{%c64}) -> (!stream.resource<external>{%10}, !stream.resource<external>{%c64}) { | |
%15 = stream.async.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%arg2[%c0 to %3 for %3], %0, %1, %5, %6, %4) : (!stream.resource<constant>{%3}, index, index, index, index, index) -> !stream.resource<external>{%10} | |
%16 = stream.async.transfer %arg3 : !stream.resource<constant>{%c64} from(#hal.device.affinity<@__device_0>) -> !stream.resource<external>{%c64} | |
stream.yield %15, %16 : !stream.resource<external>{%10}, !stream.resource<external>{%c64} | |
} | |
stream.yield %14#0, %14#1 : !stream.resource<external>{%10}, !stream.resource<external>{%c64} | |
} => !stream.timepoint | |
// The await lists the results in swapped order: %11#0 is %results#1 (the transferred
// constant) and %11#1 is %results#0 (the dispatch output).
%11:2 = stream.timepoint.await %result_timepoint => %results#1, %results#0 : !stream.resource<external>{%c64}, !stream.resource<external>{%10} | |
%12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11#0 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11#1 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
// %13 (dispatch output) is the first operand, %12 (transferred constant) the second.
check.expect_eq(%13, %12) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After PropagateTimepointsPass (iree-stream-propagate-timepoints) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
// Public entry point marked iree.abi.stub: takes no arguments, returns nothing, and only
// forwards to the private @_fully_dynamic_pack_simple.
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// Dispatch executable whose body is a single tensor.pack over i32 data.
// The export region derives the workgroup counts (%x, %y, %z) from its five index arguments
// via flow.dispatch.workgroup_count_from_slice.
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// %arg0 and %arg6 are the two bindings; %arg1..%arg5 are index operands tagged below as
// workload ordinals 0..4.
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
// %arg0 is viewed as a readonly ?x? i32 tensor with dynamic dims (%0, %1).
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
// %arg6 is viewed as a writeonly 4-D i32 tensor with dims (%2, %3, %4, %4); %4 is reused as
// the tile size for both inner dims of the pack below.
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
// Load the whole input, pack dims 0 and 1 with %4 x %4 inner tiles, then store the whole result.
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private mutable @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
// Initializer after timepoint propagation: in addition to the two constant resources, the
// execute region's %result_timepoint is stored into a per-constant __timepoint global.
util.initializer { | |
%c64 = arith.constant 64 : index | |
%results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> (!stream.resource<constant>{%c64}, !stream.resource<constant>{%c64}) { | |
%cst = stream.async.constant : !stream.resource<constant>{%c64} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%cst_0 = stream.async.constant : !stream.resource<constant>{%c64} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
stream.yield %cst, %cst_0 : !stream.resource<constant>{%c64}, !stream.resource<constant>{%c64} | |
} => !stream.timepoint | |
// The await is still present, but its results (%0#0, %0#1) are not referenced by the stores
// below.
%0:2 = stream.timepoint.await %result_timepoint => %results#0, %results#1 : !stream.resource<constant>{%c64}, !stream.resource<constant>{%c64} | |
// The same %result_timepoint is stored for both constants, alongside the un-awaited
// %results values.
util.global.store %result_timepoint, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.global.store %results#0, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %result_timepoint, @__constant_tensor_4x4xi32__timepoint : !stream.timepoint | |
util.global.store %results#1, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.global private mutable @__constant_tensor_4x4xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
// Test body after timepoint propagation: each constant global is loaded together with its
// __timepoint global, and the stream.async.execute region awaits the join of both timepoints.
util.func private @_fully_dynamic_pack_simple() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
// The awaited values %0 and %2 below are not referenced again in this function; the execute
// region captures the loaded globals directly and waits on the joined timepoint %14.
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%0 = stream.timepoint.await %__constant_tensor_2x2x2x2xi32__timepoint => %__constant_tensor_2x2x2x2xi32 : !stream.resource<constant>{%c64} | |
%__constant_tensor_4x4xi32__timepoint = util.global.load @__constant_tensor_4x4xi32__timepoint : !stream.timepoint | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
// The 4x4 constant's size is queried with stream.resource.size for its await (%1); the
// execute region below uses the computed %6 instead.
%1 = stream.resource.size %__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%2 = stream.timepoint.await %__constant_tensor_4x4xi32__timepoint => %__constant_tensor_4x4xi32 : !stream.resource<constant>{%1} | |
// The values 4, 4 and 2 are routed through util.optimization_barrier before being used in
// the size arithmetic and as dispatch operands.
%3 = util.optimization_barrier %c4 : index | |
%4 = util.optimization_barrier %c4 : index | |
// %6 = %3 * 4 * %4: the size attached to the captured 4x4 constant resource.
%5 = arith.muli %3, %c4 : index | |
%6 = arith.muli %5, %4 : index | |
%7 = util.optimization_barrier %c2 : index | |
%8 = arith.ceildivui %3, %7 : index | |
%9 = arith.ceildivui %4, %7 : index | |
// %13 = %8 * 4 * %9 * %7 * %7: the size attached to the dispatch result.
%10 = arith.muli %8, %c4 : index | |
%11 = arith.muli %10, %9 : index | |
%12 = arith.muli %11, %7 : index | |
%13 = arith.muli %12, %7 : index | |
// Join both constants' timepoints; the execute region awaits the joined timepoint.
%14 = stream.timepoint.join max(%__constant_tensor_4x4xi32__timepoint, %__constant_tensor_2x2x2x2xi32__timepoint) => !stream.timepoint | |
%results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) await(%14) => with(%__constant_tensor_4x4xi32 as %arg0: !stream.resource<constant>{%6}, %__constant_tensor_2x2x2x2xi32 as %arg1: !stream.resource<constant>{%c64}) -> (!stream.resource<external>{%13}, !stream.resource<external>{%c64}) { | |
%18:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<constant>{%6}, %arg1 as %arg3: !stream.resource<constant>{%c64}) -> (!stream.resource<external>{%13}, !stream.resource<external>{%c64}) { | |
%19 = stream.async.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%3, %4, %8, %9, %7](%arg2[%c0 to %6 for %6], %3, %4, %8, %9, %7) : (!stream.resource<constant>{%6}, index, index, index, index, index) -> !stream.resource<external>{%13} | |
%20 = stream.async.transfer %arg3 : !stream.resource<constant>{%c64} from(#hal.device.affinity<@__device_0>) -> !stream.resource<external>{%c64} | |
stream.yield %19, %20 : !stream.resource<external>{%13}, !stream.resource<external>{%c64} | |
} | |
stream.yield %18#0, %18#1 : !stream.resource<external>{%13}, !stream.resource<external>{%c64} | |
} => !stream.timepoint | |
// The await lists the results in swapped order: %15#0 is %results#1 (the transferred
// constant) and %15#1 is %results#0 (the dispatch output).
%15:2 = stream.timepoint.await %result_timepoint => %results#1, %results#0 : !stream.resource<external>{%c64}, !stream.resource<external>{%13} | |
%16 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %15#0 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%17 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %15#1 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
// %17 (dispatch output) is the first operand, %16 (transferred constant) the second.
check.expect_eq(%17, %16) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After MaterializeBuiltinsPass (iree-stream-materialize-builtins) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private mutable @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.initializer { | |
%c64 = arith.constant 64 : index | |
%results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> (!stream.resource<constant>{%c64}, !stream.resource<constant>{%c64}) { | |
%cst = stream.async.constant : !stream.resource<constant>{%c64} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%cst_0 = stream.async.constant : !stream.resource<constant>{%c64} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
stream.yield %cst, %cst_0 : !stream.resource<constant>{%c64}, !stream.resource<constant>{%c64} | |
} => !stream.timepoint | |
%0:2 = stream.timepoint.await %result_timepoint => %results#0, %results#1 : !stream.resource<constant>{%c64}, !stream.resource<constant>{%c64} | |
util.global.store %result_timepoint, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.global.store %results#0, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %result_timepoint, @__constant_tensor_4x4xi32__timepoint : !stream.timepoint | |
util.global.store %results#1, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.global private mutable @__constant_tensor_4x4xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.func private @_fully_dynamic_pack_simple() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%0 = stream.timepoint.await %__constant_tensor_2x2x2x2xi32__timepoint => %__constant_tensor_2x2x2x2xi32 : !stream.resource<constant>{%c64} | |
%__constant_tensor_4x4xi32__timepoint = util.global.load @__constant_tensor_4x4xi32__timepoint : !stream.timepoint | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%1 = stream.resource.size %__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%2 = stream.timepoint.await %__constant_tensor_4x4xi32__timepoint => %__constant_tensor_4x4xi32 : !stream.resource<constant>{%1} | |
%3 = util.optimization_barrier %c4 : index | |
%4 = util.optimization_barrier %c4 : index | |
%5 = arith.muli %3, %c4 : index | |
%6 = arith.muli %5, %4 : index | |
%7 = util.optimization_barrier %c2 : index | |
%8 = arith.ceildivui %3, %7 : index | |
%9 = arith.ceildivui %4, %7 : index | |
%10 = arith.muli %8, %c4 : index | |
%11 = arith.muli %10, %9 : index | |
%12 = arith.muli %11, %7 : index | |
%13 = arith.muli %12, %7 : index | |
%14 = stream.timepoint.join max(%__constant_tensor_4x4xi32__timepoint, %__constant_tensor_2x2x2x2xi32__timepoint) => !stream.timepoint | |
%results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) await(%14) => with(%__constant_tensor_4x4xi32 as %arg0: !stream.resource<constant>{%6}, %__constant_tensor_2x2x2x2xi32 as %arg1: !stream.resource<constant>{%c64}) -> (!stream.resource<external>{%13}, !stream.resource<external>{%c64}) { | |
%18:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<constant>{%6}, %arg1 as %arg3: !stream.resource<constant>{%c64}) -> (!stream.resource<external>{%13}, !stream.resource<external>{%c64}) { | |
%19 = stream.async.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%3, %4, %8, %9, %7](%arg2[%c0 to %6 for %6], %3, %4, %8, %9, %7) : (!stream.resource<constant>{%6}, index, index, index, index, index) -> !stream.resource<external>{%13} | |
%20 = stream.async.transfer %arg3 : !stream.resource<constant>{%c64} from(#hal.device.affinity<@__device_0>) -> !stream.resource<external>{%c64} | |
stream.yield %19, %20 : !stream.resource<external>{%13}, !stream.resource<external>{%c64} | |
} | |
stream.yield %18#0, %18#1 : !stream.resource<external>{%13}, !stream.resource<external>{%c64} | |
} => !stream.timepoint | |
%15:2 = stream.timepoint.await %result_timepoint => %results#1, %results#0 : !stream.resource<external>{%c64}, !stream.resource<external>{%13} | |
%16 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %15#0 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%17 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %15#1 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%17, %16) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.initializer { | |
%c64 = arith.constant 64 : index | |
%results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> (!stream.resource<constant>{%c64}, !stream.resource<constant>{%c64}) { | |
%cst = stream.async.constant : !stream.resource<constant>{%c64} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%cst_0 = stream.async.constant : !stream.resource<constant>{%c64} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
stream.yield %cst, %cst_0 : !stream.resource<constant>{%c64}, !stream.resource<constant>{%c64} | |
} => !stream.timepoint | |
util.global.store %result_timepoint, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.global.store %results#0, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %result_timepoint, @__constant_tensor_4x4xi32__timepoint : !stream.timepoint | |
util.global.store %results#1, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.initializer { | |
%c64 = arith.constant 64 : index | |
%results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> (!stream.resource<constant>{%c64}, !stream.resource<constant>{%c64}) { | |
%cst = stream.async.constant : !stream.resource<constant>{%c64} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%cst_0 = stream.async.constant : !stream.resource<constant>{%c64} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
stream.yield %cst, %cst_0 : !stream.resource<constant>{%c64}, !stream.resource<constant>{%c64} | |
} => !stream.timepoint | |
util.global.store %result_timepoint, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.global.store %results#0, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %result_timepoint, @__constant_tensor_4x4xi32__timepoint : !stream.timepoint | |
util.global.store %results#1, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
util.initializer { | |
%c64 = arith.constant 64 : index | |
%results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> (!stream.resource<constant>{%c64}, !stream.resource<constant>{%c64}) { | |
%cst = stream.async.constant : !stream.resource<constant>{%c64} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%cst_0 = stream.async.constant : !stream.resource<constant>{%c64} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
stream.yield %cst, %cst_0 : !stream.resource<constant>{%c64}, !stream.resource<constant>{%c64} | |
} => !stream.timepoint | |
util.global.store %results#0, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %result_timepoint, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.global.store %results#1, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.global.store %result_timepoint, @__constant_tensor_4x4xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32__timepoint = util.global.load @__constant_tensor_4x4xi32__timepoint : !stream.timepoint | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = stream.timepoint.join max(%__constant_tensor_4x4xi32__timepoint, %__constant_tensor_2x2x2x2xi32__timepoint) => !stream.timepoint | |
%results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) await(%11) => with(%__constant_tensor_4x4xi32 as %arg0: !stream.resource<constant>{%3}, %__constant_tensor_2x2x2x2xi32 as %arg1: !stream.resource<constant>{%c64}) -> (!stream.resource<external>{%10}, !stream.resource<external>{%c64}) { | |
%15:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<constant>{%3}, %arg1 as %arg3: !stream.resource<constant>{%c64}) -> (!stream.resource<external>{%10}, !stream.resource<external>{%c64}) { | |
%16 = stream.async.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%arg2[%c0 to %3 for %3], %0, %1, %5, %6, %4) : (!stream.resource<constant>{%3}, index, index, index, index, index) -> !stream.resource<external>{%10} | |
%17 = stream.async.transfer %arg3 : !stream.resource<constant>{%c64} from(#hal.device.affinity<@__device_0>) -> !stream.resource<external>{%c64} | |
stream.yield %16, %17 : !stream.resource<external>{%10}, !stream.resource<external>{%c64} | |
} | |
stream.yield %15#0, %15#1 : !stream.resource<external>{%10}, !stream.resource<external>{%c64} | |
} => !stream.timepoint | |
%12:2 = stream.timepoint.await %result_timepoint => %results#1, %results#0 : !stream.resource<external>{%c64}, !stream.resource<external>{%10} | |
%13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12#0 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12#1 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%14, %13) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32__timepoint = util.global.load @__constant_tensor_4x4xi32__timepoint : !stream.timepoint | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = stream.timepoint.join max(%__constant_tensor_4x4xi32__timepoint, %__constant_tensor_2x2x2x2xi32__timepoint) => !stream.timepoint | |
%results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) await(%11) => with(%__constant_tensor_4x4xi32 as %arg0: !stream.resource<constant>{%3}, %__constant_tensor_2x2x2x2xi32 as %arg1: !stream.resource<constant>{%c64}) -> (!stream.resource<external>{%10}, !stream.resource<external>{%c64}) { | |
%15:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<constant>{%3}, %arg1 as %arg3: !stream.resource<constant>{%c64}) -> (!stream.resource<external>{%10}, !stream.resource<external>{%c64}) { | |
%16 = stream.async.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%arg2[%c0 to %3 for %3], %0, %1, %5, %6, %4) : (!stream.resource<constant>{%3}, index, index, index, index, index) -> !stream.resource<external>{%10} | |
%17 = stream.async.transfer %arg3 : !stream.resource<constant>{%c64} from(#hal.device.affinity<@__device_0>) -> !stream.resource<external>{%c64} | |
stream.yield %16, %17 : !stream.resource<external>{%10}, !stream.resource<external>{%c64} | |
} | |
stream.yield %15#0, %15#1 : !stream.resource<external>{%10}, !stream.resource<external>{%c64} | |
} => !stream.timepoint | |
%12:2 = stream.timepoint.await %result_timepoint => %results#1, %results#0 : !stream.resource<external>{%c64}, !stream.resource<external>{%10} | |
%13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12#0 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12#1 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%14, %13) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%__constant_tensor_4x4xi32__timepoint = util.global.load @__constant_tensor_4x4xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = stream.timepoint.join max(%__constant_tensor_4x4xi32__timepoint, %__constant_tensor_2x2x2x2xi32__timepoint) => !stream.timepoint | |
%results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) await(%11) => with(%__constant_tensor_4x4xi32 as %arg0: !stream.resource<constant>{%3}, %__constant_tensor_2x2x2x2xi32 as %arg1: !stream.resource<constant>{%c64}) -> (!stream.resource<external>{%10}, !stream.resource<external>{%c64}) { | |
%15:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<constant>{%3}, %arg1 as %arg3: !stream.resource<constant>{%c64}) -> (!stream.resource<external>{%10}, !stream.resource<external>{%c64}) { | |
%16 = stream.async.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%arg2[%c0 to %3 for %3], %0, %1, %5, %6, %4) : (!stream.resource<constant>{%3}, index, index, index, index, index) -> !stream.resource<external>{%10} | |
%17 = stream.async.transfer %arg3 : !stream.resource<constant>{%c64} from(#hal.device.affinity<@__device_0>) -> !stream.resource<external>{%c64} | |
stream.yield %16, %17 : !stream.resource<external>{%10}, !stream.resource<external>{%c64} | |
} | |
stream.yield %15#0, %15#1 : !stream.resource<external>{%10}, !stream.resource<external>{%c64} | |
} => !stream.timepoint | |
%12:2 = stream.timepoint.await %result_timepoint => %results#1, %results#0 : !stream.resource<external>{%c64}, !stream.resource<external>{%10} | |
%13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12#0 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12#1 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%14, %13) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private mutable @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.initializer { | |
%c64 = arith.constant 64 : index | |
%results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> (!stream.resource<constant>{%c64}, !stream.resource<constant>{%c64}) { | |
%cst = stream.async.constant : !stream.resource<constant>{%c64} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%cst_0 = stream.async.constant : !stream.resource<constant>{%c64} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
stream.yield %cst, %cst_0 : !stream.resource<constant>{%c64}, !stream.resource<constant>{%c64} | |
} => !stream.timepoint | |
util.global.store %results#0, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %result_timepoint, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.global.store %results#1, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.global.store %result_timepoint, @__constant_tensor_4x4xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
util.global private mutable @__constant_tensor_4x4xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.func private @_fully_dynamic_pack_simple() { | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_4x4xi32__timepoint = util.global.load @__constant_tensor_4x4xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = stream.timepoint.join max(%__constant_tensor_4x4xi32__timepoint, %__constant_tensor_2x2x2x2xi32__timepoint) => !stream.timepoint | |
%results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) await(%11) => with(%__constant_tensor_4x4xi32 as %arg0: !stream.resource<constant>{%3}, %__constant_tensor_2x2x2x2xi32 as %arg1: !stream.resource<constant>{%c64}) -> (!stream.resource<external>{%10}, !stream.resource<external>{%c64}) { | |
%15:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<constant>{%3}, %arg1 as %arg3: !stream.resource<constant>{%c64}) -> (!stream.resource<external>{%10}, !stream.resource<external>{%c64}) { | |
%16 = stream.async.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%arg2[%c0 to %3 for %3], %0, %1, %5, %6, %4) : (!stream.resource<constant>{%3}, index, index, index, index, index) -> !stream.resource<external>{%10} | |
%17 = stream.async.transfer %arg3 : !stream.resource<constant>{%c64} from(#hal.device.affinity<@__device_0>) -> !stream.resource<external>{%c64} | |
stream.yield %16, %17 : !stream.resource<external>{%10}, !stream.resource<external>{%c64} | |
} | |
stream.yield %15#0, %15#1 : !stream.resource<external>{%10}, !stream.resource<external>{%c64} | |
} => !stream.timepoint | |
%12:2 = stream.timepoint.await %result_timepoint => %results#1, %results#0 : !stream.resource<external>{%c64}, !stream.resource<external>{%10} | |
%13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12#0 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12#1 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%14, %13) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.initializer { | |
%c64 = arith.constant 64 : index | |
%results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> (!stream.resource<constant>{%c64}, !stream.resource<constant>{%c64}) { | |
%cst = stream.async.constant : !stream.resource<constant>{%c64} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%cst_0 = stream.async.constant : !stream.resource<constant>{%c64} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
stream.yield %cst, %cst_0 : !stream.resource<constant>{%c64}, !stream.resource<constant>{%c64} | |
} => !stream.timepoint | |
util.global.store %results#0, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %result_timepoint, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.global.store %results#1, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.global.store %result_timepoint, @__constant_tensor_4x4xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
util.global private @__constant_tensor_4x4xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.func private @_fully_dynamic_pack_simple() { | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_4x4xi32__timepoint = util.global.load immutable @__constant_tensor_4x4xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = stream.timepoint.join max(%__constant_tensor_4x4xi32__timepoint, %__constant_tensor_2x2x2x2xi32__timepoint) => !stream.timepoint | |
%results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) await(%11) => with(%__constant_tensor_4x4xi32 as %arg0: !stream.resource<constant>{%3}, %__constant_tensor_2x2x2x2xi32 as %arg1: !stream.resource<constant>{%c64}) -> (!stream.resource<external>{%10}, !stream.resource<external>{%c64}) { | |
%15:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<constant>{%3}, %arg1 as %arg3: !stream.resource<constant>{%c64}) -> (!stream.resource<external>{%10}, !stream.resource<external>{%c64}) { | |
%16 = stream.async.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%arg2[%c0 to %3 for %3], %0, %1, %5, %6, %4) : (!stream.resource<constant>{%3}, index, index, index, index, index) -> !stream.resource<external>{%10} | |
%17 = stream.async.transfer %arg3 : !stream.resource<constant>{%c64} from(#hal.device.affinity<@__device_0>) -> !stream.resource<external>{%c64} | |
stream.yield %16, %17 : !stream.resource<external>{%10}, !stream.resource<external>{%c64} | |
} | |
stream.yield %15#0, %15#1 : !stream.resource<external>{%10}, !stream.resource<external>{%c64} | |
} => !stream.timepoint | |
%12:2 = stream.timepoint.await %result_timepoint => %results#1, %results#0 : !stream.resource<external>{%c64}, !stream.resource<external>{%10} | |
%13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12#0 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12#1 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%14, %13) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.initializer { | |
%c64 = arith.constant 64 : index | |
%results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> (!stream.resource<constant>{%c64}, !stream.resource<constant>{%c64}) { | |
%cst = stream.async.constant : !stream.resource<constant>{%c64} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%cst_0 = stream.async.constant : !stream.resource<constant>{%c64} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
stream.yield %cst, %cst_0 : !stream.resource<constant>{%c64}, !stream.resource<constant>{%c64} | |
} => !stream.timepoint | |
util.global.store %results#0, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %result_timepoint, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.global.store %results#1, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.global private @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.func private @_fully_dynamic_pack_simple() { | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32__timepoint_0 = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = stream.timepoint.join max(%__constant_tensor_2x2x2x2xi32__timepoint, %__constant_tensor_2x2x2x2xi32__timepoint_0) => !stream.timepoint | |
%results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) await(%11) => with(%__constant_tensor_4x4xi32 as %arg0: !stream.resource<constant>{%3}, %__constant_tensor_2x2x2x2xi32 as %arg1: !stream.resource<constant>{%c64}) -> (!stream.resource<external>{%10}, !stream.resource<external>{%c64}) { | |
%15:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<constant>{%3}, %arg1 as %arg3: !stream.resource<constant>{%c64}) -> (!stream.resource<external>{%10}, !stream.resource<external>{%c64}) { | |
%16 = stream.async.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%arg2[%c0 to %3 for %3], %0, %1, %5, %6, %4) : (!stream.resource<constant>{%3}, index, index, index, index, index) -> !stream.resource<external>{%10} | |
%17 = stream.async.transfer %arg3 : !stream.resource<constant>{%c64} from(#hal.device.affinity<@__device_0>) -> !stream.resource<external>{%c64} | |
stream.yield %16, %17 : !stream.resource<external>{%10}, !stream.resource<external>{%c64} | |
} | |
stream.yield %15#0, %15#1 : !stream.resource<external>{%10}, !stream.resource<external>{%c64} | |
} => !stream.timepoint | |
%12:2 = stream.timepoint.await %result_timepoint => %results#1, %results#0 : !stream.resource<external>{%c64}, !stream.resource<external>{%10} | |
%13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12#0 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12#1 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%14, %13) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After IPO (iree-util-ipo) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.initializer { | |
%c64 = arith.constant 64 : index | |
%results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> (!stream.resource<constant>{%c64}, !stream.resource<constant>{%c64}) { | |
%cst = stream.async.constant : !stream.resource<constant>{%c64} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%cst_0 = stream.async.constant : !stream.resource<constant>{%c64} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
stream.yield %cst, %cst_0 : !stream.resource<constant>{%c64}, !stream.resource<constant>{%c64} | |
} => !stream.timepoint | |
util.global.store %results#0, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %result_timepoint, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.global.store %results#1, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.global private @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.func private @_fully_dynamic_pack_simple() { | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32__timepoint_0 = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = stream.timepoint.join max(%__constant_tensor_2x2x2x2xi32__timepoint, %__constant_tensor_2x2x2x2xi32__timepoint_0) => !stream.timepoint | |
%results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) await(%11) => with(%__constant_tensor_4x4xi32 as %arg0: !stream.resource<constant>{%3}, %__constant_tensor_2x2x2x2xi32 as %arg1: !stream.resource<constant>{%c64}) -> (!stream.resource<external>{%10}, !stream.resource<external>{%c64}) { | |
%15:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<constant>{%3}, %arg1 as %arg3: !stream.resource<constant>{%c64}) -> (!stream.resource<external>{%10}, !stream.resource<external>{%c64}) { | |
%16 = stream.async.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%arg2[%c0 to %3 for %3], %0, %1, %5, %6, %4) : (!stream.resource<constant>{%3}, index, index, index, index, index) -> !stream.resource<external>{%10} | |
%17 = stream.async.transfer %arg3 : !stream.resource<constant>{%c64} from(#hal.device.affinity<@__device_0>) -> !stream.resource<external>{%c64} | |
stream.yield %16, %17 : !stream.resource<external>{%10}, !stream.resource<external>{%c64} | |
} | |
stream.yield %15#0, %15#1 : !stream.resource<external>{%10}, !stream.resource<external>{%c64} | |
} => !stream.timepoint | |
%12:2 = stream.timepoint.await %result_timepoint => %results#1, %results#0 : !stream.resource<external>{%c64}, !stream.resource<external>{%10} | |
%13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12#0 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12#1 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%14, %13) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After VerifyLoweringToAsyncPass (iree-stream-verify-lowering-to-async) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.initializer { | |
%c64 = arith.constant 64 : index | |
%results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> (!stream.resource<constant>{%c64}, !stream.resource<constant>{%c64}) { | |
%cst = stream.async.constant : !stream.resource<constant>{%c64} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32> | |
%cst_0 = stream.async.constant : !stream.resource<constant>{%c64} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
stream.yield %cst, %cst_0 : !stream.resource<constant>{%c64}, !stream.resource<constant>{%c64} | |
} => !stream.timepoint | |
util.global.store %results#0, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %result_timepoint, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.global.store %results#1, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.global private @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.func private @_fully_dynamic_pack_simple() { | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32__timepoint_0 = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = stream.timepoint.join max(%__constant_tensor_2x2x2x2xi32__timepoint, %__constant_tensor_2x2x2x2xi32__timepoint_0) => !stream.timepoint | |
%results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) await(%11) => with(%__constant_tensor_4x4xi32 as %arg0: !stream.resource<constant>{%3}, %__constant_tensor_2x2x2x2xi32 as %arg1: !stream.resource<constant>{%c64}) -> (!stream.resource<external>{%10}, !stream.resource<external>{%c64}) { | |
%15:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<constant>{%3}, %arg1 as %arg3: !stream.resource<constant>{%c64}) -> (!stream.resource<external>{%10}, !stream.resource<external>{%c64}) { | |
%16 = stream.async.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%arg2[%c0 to %3 for %3], %0, %1, %5, %6, %4) : (!stream.resource<constant>{%3}, index, index, index, index, index) -> !stream.resource<external>{%10} | |
%17 = stream.async.transfer %arg3 : !stream.resource<constant>{%c64} from(#hal.device.affinity<@__device_0>) -> !stream.resource<external>{%c64} | |
stream.yield %16, %17 : !stream.resource<external>{%10}, !stream.resource<external>{%c64} | |
} | |
stream.yield %15#0, %15#1 : !stream.resource<external>{%10}, !stream.resource<external>{%c64} | |
} => !stream.timepoint | |
%12:2 = stream.timepoint.await %result_timepoint => %results#1, %results#0 : !stream.resource<external>{%c64}, !stream.resource<external>{%10} | |
%13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12#0 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12#1 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%14, %13) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After ScheduleAllocationPass (iree-stream-schedule-allocation) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.initializer { | |
%c64 = arith.constant 64 : index | |
%results:2, %result_timepoint = stream.resource.constants on(#hal.device.affinity<@__device_0>) : | |
!stream.resource<constant>{%c64} = dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
!stream.resource<constant>{%c64} = dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32> | |
=> !stream.timepoint | |
%0 = stream.cmd.execute once on(#hal.device.affinity<@__device_0>) with() { | |
} => !stream.timepoint | |
%1 = stream.timepoint.join max(%result_timepoint, %0) => !stream.timepoint | |
util.global.store %results#0, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %1, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.global.store %results#1, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.global private @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.func private @_fully_dynamic_pack_simple() { | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32__timepoint_0 = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = stream.timepoint.join max(%__constant_tensor_2x2x2x2xi32__timepoint, %__constant_tensor_2x2x2x2xi32__timepoint_0) => !stream.timepoint | |
%c0_1 = arith.constant 0 : index | |
%12:3 = stream.resource.pack on(#hal.device.affinity<@__device_0>) slices({ | |
[0, 0] = %10, | |
[0, 0] = %c64 | |
}) : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%11) => !stream.resource<external>{%12#0} => !stream.timepoint | |
%13 = stream.resource.subview %result[%12#1] : !stream.resource<external>{%12#0} -> !stream.resource<external>{%10} | |
%14 = stream.resource.subview %result[%12#2] : !stream.resource<external>{%12#0} -> !stream.resource<external>{%c64} | |
%15 = stream.timepoint.join max(%11, %result_timepoint) => !stream.timepoint | |
%16 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%15) => with(%__constant_tensor_4x4xi32 as %arg0: !stream.resource<constant>{%3}, %__constant_tensor_2x2x2x2xi32 as %arg1: !stream.resource<constant>{%c64}, %13 as %arg2: !stream.resource<external>{%10}, %14 as %arg3: !stream.resource<external>{%c64}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%0, %1, %5, %6, %4 : index, index, index, index, index) { | |
ro %arg0[%c0 for %3] : !stream.resource<constant>{%3}, | |
wo %arg2[%c0_1 for %10] : !stream.resource<external>{%10} | |
} | |
stream.cmd.copy %arg1[%c0_1], %arg3[%c0_1], %c64 : !stream.resource<constant>{%c64} -> !stream.resource<external>{%c64} | |
stream.cmd.flush %arg3[%c0_1 for %c64] : !stream.resource<external>{%c64} | |
} | |
} => !stream.timepoint | |
%17:2 = stream.timepoint.await %16 => %14, %13 : !stream.resource<external>{%c64}, !stream.resource<external>{%10} | |
%18 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %17#0 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%19 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %17#1 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%19, %18) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After PackConstantsPass (iree-stream-pack-constants) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After LayoutSlicesPass (iree-stream-layout-slices) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After PackConstantsPass (iree-stream-pack-constants) //----- // | |
util.initializer { | |
%c64 = arith.constant 64 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
%c0_i64 = arith.constant 0 : i64 | |
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) { | |
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant> | |
} else { | |
%6 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%7 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %6[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
scf.yield %7, %6 : !stream.timepoint, !stream.resource<constant> | |
} | |
%2 = stream.resource.subview %1#1[%c0] : !stream.resource<constant>{%c128} -> !stream.resource<constant>{%c64} | |
%3 = stream.resource.subview %1#1[%c64] : !stream.resource<constant>{%c128} -> !stream.resource<constant>{%c64} | |
%4 = stream.cmd.execute once on(#hal.device.affinity<@__device_0>) with() { | |
} => !stream.timepoint | |
%5 = stream.timepoint.join max(%1#0, %4) => !stream.timepoint | |
util.global.store %2, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %5, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.global.store %3, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After LayoutSlicesPass (iree-stream-layout-slices) //----- // | |
util.initializer { | |
%c64 = arith.constant 64 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
%c0_i64 = arith.constant 0 : i64 | |
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) { | |
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant> | |
} else { | |
%6 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%7 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %6[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
scf.yield %7, %6 : !stream.timepoint, !stream.resource<constant> | |
} | |
%2 = stream.resource.subview %1#1[%c0] : !stream.resource<constant>{%c128} -> !stream.resource<constant>{%c64} | |
%3 = stream.resource.subview %1#1[%c64] : !stream.resource<constant>{%c128} -> !stream.resource<constant>{%c64} | |
%4 = stream.cmd.execute once on(#hal.device.affinity<@__device_0>) with() { | |
} => !stream.timepoint | |
%5 = stream.timepoint.join max(%1#0, %4) => !stream.timepoint | |
util.global.store %2, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %5, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.global.store %3, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After PackConstantsPass (iree-stream-pack-constants) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32__timepoint_0 = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = stream.timepoint.join max(%__constant_tensor_2x2x2x2xi32__timepoint, %__constant_tensor_2x2x2x2xi32__timepoint_0) => !stream.timepoint | |
%c0_1 = arith.constant 0 : index | |
%12:3 = stream.resource.pack on(#hal.device.affinity<@__device_0>) slices({ | |
[0, 0] = %10, | |
[0, 0] = %c64 | |
}) : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%11) => !stream.resource<external>{%12#0} => !stream.timepoint | |
%13 = stream.resource.subview %result[%12#1] : !stream.resource<external>{%12#0} -> !stream.resource<external>{%10} | |
%14 = stream.resource.subview %result[%12#2] : !stream.resource<external>{%12#0} -> !stream.resource<external>{%c64} | |
%15 = stream.timepoint.join max(%11, %result_timepoint) => !stream.timepoint | |
%16 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%15) => with(%__constant_tensor_4x4xi32 as %arg0: !stream.resource<constant>{%3}, %__constant_tensor_2x2x2x2xi32 as %arg1: !stream.resource<constant>{%c64}, %13 as %arg2: !stream.resource<external>{%10}, %14 as %arg3: !stream.resource<external>{%c64}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%0, %1, %5, %6, %4 : index, index, index, index, index) { | |
ro %arg0[%c0 for %3] : !stream.resource<constant>{%3}, | |
wo %arg2[%c0_1 for %10] : !stream.resource<external>{%10} | |
} | |
stream.cmd.copy %arg1[%c0_1], %arg3[%c0_1], %c64 : !stream.resource<constant>{%c64} -> !stream.resource<external>{%c64} | |
stream.cmd.flush %arg3[%c0_1 for %c64] : !stream.resource<external>{%c64} | |
} | |
} => !stream.timepoint | |
%17:2 = stream.timepoint.await %16 => %14, %13 : !stream.resource<external>{%c64}, !stream.resource<external>{%10} | |
%18 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %17#0 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%19 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %17#1 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%19, %18) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After LayoutSlicesPass (iree-stream-layout-slices) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32__timepoint_0 = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = stream.timepoint.join max(%__constant_tensor_2x2x2x2xi32__timepoint, %__constant_tensor_2x2x2x2xi32__timepoint_0) => !stream.timepoint | |
%c0_1 = arith.constant 0 : index | |
%c0_2 = arith.constant 0 : index | |
%c64_3 = arith.constant 64 : index | |
%c64_4 = arith.constant 64 : index | |
%c64_5 = arith.constant 64 : index | |
%12 = util.align %10, %c64_5 : index | |
%13 = arith.addi %12, %c64_4 : index | |
%c64_6 = arith.constant 64 : index | |
%c64_7 = arith.constant 64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%11) => !stream.resource<external>{%13} => !stream.timepoint | |
%14 = stream.resource.subview %result[%c64_4] : !stream.resource<external>{%13} -> !stream.resource<external>{%10} | |
%15 = stream.resource.subview %result[%c0_2] : !stream.resource<external>{%13} -> !stream.resource<external>{%c64} | |
%16 = stream.timepoint.join max(%11, %result_timepoint) => !stream.timepoint | |
%17 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%16) => with(%__constant_tensor_4x4xi32 as %arg0: !stream.resource<constant>{%3}, %__constant_tensor_2x2x2x2xi32 as %arg1: !stream.resource<constant>{%c64}, %14 as %arg2: !stream.resource<external>{%10}, %15 as %arg3: !stream.resource<external>{%c64}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%0, %1, %5, %6, %4 : index, index, index, index, index) { | |
ro %arg0[%c0 for %3] : !stream.resource<constant>{%3}, | |
wo %arg2[%c0_1 for %10] : !stream.resource<external>{%10} | |
} | |
stream.cmd.copy %arg1[%c0_1], %arg3[%c0_1], %c64 : !stream.resource<constant>{%c64} -> !stream.resource<external>{%c64} | |
stream.cmd.flush %arg3[%c0_1 for %c64] : !stream.resource<external>{%c64} | |
} | |
} => !stream.timepoint | |
%18:2 = stream.timepoint.await %17 => %15, %14 : !stream.resource<external>{%c64}, !stream.resource<external>{%10} | |
%19 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %18#0 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%20 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %18#1 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%20, %19) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After PropagateSubranges (iree-util-propagate-subranges) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global private mutable @__constant_tensor_2x2x2x2xi32__storage_size : index | |
util.global private mutable @__constant_tensor_2x2x2x2xi32__offset : index | |
util.global private mutable @__constant_tensor_2x2x2x2xi32__length : index | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%c128 = arith.constant 128 : index | |
%c0_0 = arith.constant 0 : index | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0_0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
%c0_i64 = arith.constant 0 : i64 | |
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) { | |
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant> | |
} else { | |
%6 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0_0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%7 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %6[%c0_0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
scf.yield %7, %6 : !stream.timepoint, !stream.resource<constant> | |
} | |
%2 = stream.resource.subview %1#1[%c0_0] : !stream.resource<constant>{%c128} -> !stream.resource<constant>{%c64} | |
%3 = stream.resource.subview %1#1[%c64] : !stream.resource<constant>{%c128} -> !stream.resource<constant>{%c64} | |
%4 = stream.cmd.execute once on(#hal.device.affinity<@__device_0>) with() { | |
} => !stream.timepoint | |
%5 = stream.timepoint.join max(%1#0, %4) => !stream.timepoint | |
util.global.store %1#1, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %c128, @__constant_tensor_2x2x2x2xi32__storage_size : index | |
util.global.store %c0_0, @__constant_tensor_2x2x2x2xi32__offset : index | |
util.global.store %c64, @__constant_tensor_2x2x2x2xi32__length : index | |
util.global.store %5, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.global.store %1#1, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.global.store %c128, @__constant_tensor_4x4xi32__storage_size : index | |
util.global.store %c64, @__constant_tensor_4x4xi32__offset : index | |
util.global.store %c64, @__constant_tensor_4x4xi32__length : index | |
util.return | |
} | |
util.global private @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.global private mutable @__constant_tensor_4x4xi32__storage_size : index | |
util.global private mutable @__constant_tensor_4x4xi32__offset : index | |
util.global private mutable @__constant_tensor_4x4xi32__length : index | |
util.func private @_fully_dynamic_pack_simple() { | |
%c0 = arith.constant 0 : index | |
%c0_0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32__timepoint_1 = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_2x2x2x2xi32__storage_size = util.global.load @__constant_tensor_2x2x2x2xi32__storage_size : index | |
%__constant_tensor_2x2x2x2xi32__offset = util.global.load @__constant_tensor_2x2x2x2xi32__offset : index | |
%__constant_tensor_2x2x2x2xi32__length = util.global.load @__constant_tensor_2x2x2x2xi32__length : index | |
%0 = stream.resource.subview %__constant_tensor_2x2x2x2xi32[%__constant_tensor_2x2x2x2xi32__offset] : !stream.resource<constant>{%__constant_tensor_2x2x2x2xi32__storage_size} -> !stream.resource<constant>{%__constant_tensor_2x2x2x2xi32__length} | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32__storage_size = util.global.load @__constant_tensor_4x4xi32__storage_size : index | |
%__constant_tensor_4x4xi32__offset = util.global.load @__constant_tensor_4x4xi32__offset : index | |
%__constant_tensor_4x4xi32__length = util.global.load @__constant_tensor_4x4xi32__length : index | |
%1 = stream.resource.subview %__constant_tensor_4x4xi32[%__constant_tensor_4x4xi32__offset] : !stream.resource<constant>{%__constant_tensor_4x4xi32__storage_size} -> !stream.resource<constant>{%__constant_tensor_4x4xi32__length} | |
%2 = util.optimization_barrier %c4 : index | |
%3 = util.optimization_barrier %c4 : index | |
%4 = arith.muli %2, %c4 : index | |
%5 = arith.muli %4, %3 : index | |
%6 = util.optimization_barrier %c2 : index | |
%7 = arith.ceildivui %2, %6 : index | |
%8 = arith.ceildivui %3, %6 : index | |
%9 = arith.muli %7, %c4 : index | |
%10 = arith.muli %9, %8 : index | |
%11 = arith.muli %10, %6 : index | |
%12 = arith.muli %11, %6 : index | |
%13 = stream.timepoint.join max(%__constant_tensor_2x2x2x2xi32__timepoint, %__constant_tensor_2x2x2x2xi32__timepoint_1) => !stream.timepoint | |
%c0_2 = arith.constant 0 : index | |
%c0_3 = arith.constant 0 : index | |
%c64_4 = arith.constant 64 : index | |
%c64_5 = arith.constant 64 : index | |
%c64_6 = arith.constant 64 : index | |
%14 = util.align %12, %c64_6 : index | |
%15 = arith.addi %14, %c64_5 : index | |
%c64_7 = arith.constant 64 : index | |
%c64_8 = arith.constant 64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%13) => !stream.resource<external>{%15} => !stream.timepoint | |
%16 = stream.resource.subview %result[%c64_5] : !stream.resource<external>{%15} -> !stream.resource<external>{%12} | |
%17 = stream.resource.subview %result[%c0_3] : !stream.resource<external>{%15} -> !stream.resource<external>{%c64} | |
%18 = stream.timepoint.join max(%13, %result_timepoint) => !stream.timepoint | |
%19 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%18) => with(%1 as %arg0: !stream.resource<constant>{%5}, %0 as %arg1: !stream.resource<constant>{%c64}, %16 as %arg2: !stream.resource<external>{%12}, %17 as %arg3: !stream.resource<external>{%c64}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%2, %3, %7, %8, %6](%2, %3, %7, %8, %6 : index, index, index, index, index) { | |
ro %arg0[%c0_0 for %5] : !stream.resource<constant>{%5}, | |
wo %arg2[%c0_2 for %12] : !stream.resource<external>{%12} | |
} | |
stream.cmd.copy %arg1[%c0_2], %arg3[%c0_2], %c64 : !stream.resource<constant>{%c64} -> !stream.resource<external>{%c64} | |
stream.cmd.flush %arg3[%c0_2 for %c64] : !stream.resource<external>{%c64} | |
} | |
} => !stream.timepoint | |
%20:2 = stream.timepoint.await %19 => %17, %16 : !stream.resource<external>{%c64}, !stream.resource<external>{%12} | |
%21 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %20#0 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%22 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %20#1 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%22, %21) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32__timepoint_0 = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_2x2x2x2xi32__storage_size = util.global.load @__constant_tensor_2x2x2x2xi32__storage_size : index | |
%__constant_tensor_2x2x2x2xi32__offset = util.global.load @__constant_tensor_2x2x2x2xi32__offset : index | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32__storage_size = util.global.load @__constant_tensor_4x4xi32__storage_size : index | |
%__constant_tensor_4x4xi32__offset = util.global.load @__constant_tensor_4x4xi32__offset : index | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = stream.timepoint.join max(%__constant_tensor_2x2x2x2xi32__timepoint, %__constant_tensor_2x2x2x2xi32__timepoint_0) => !stream.timepoint | |
%12 = util.align %10, %c64 : index | |
%13 = arith.addi %12, %c64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%11) => !stream.resource<external>{%13} => !stream.timepoint | |
%14 = stream.timepoint.join max(%__constant_tensor_2x2x2x2xi32__timepoint, %__constant_tensor_2x2x2x2xi32__timepoint_0, %result_timepoint) => !stream.timepoint | |
%15 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%14) => with(%__constant_tensor_4x4xi32 as %arg0: !stream.resource<constant>{%__constant_tensor_4x4xi32__storage_size}, %__constant_tensor_2x2x2x2xi32 as %arg1: !stream.resource<constant>{%__constant_tensor_2x2x2x2xi32__storage_size}, %result as %arg2: !stream.resource<external>{%13}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%0, %1, %5, %6, %4 : index, index, index, index, index) { | |
ro %arg0[%__constant_tensor_4x4xi32__offset for %3] : !stream.resource<constant>{%__constant_tensor_4x4xi32__storage_size}, | |
wo %arg2[%c64 for %10] : !stream.resource<external>{%13} | |
} | |
stream.cmd.copy %arg1[%__constant_tensor_2x2x2x2xi32__offset], %arg2[%c0], %c64 : !stream.resource<constant>{%__constant_tensor_2x2x2x2xi32__storage_size} -> !stream.resource<external>{%13} | |
stream.cmd.flush %arg2[%c0 for %c64] : !stream.resource<external>{%13} | |
} | |
} => !stream.timepoint | |
%16 = stream.timepoint.await %15 => %result : !stream.resource<external>{%13} | |
%17 = stream.resource.subview %16[%c0] : !stream.resource<external>{%13} -> !stream.resource<external>{%c64} | |
%18 = stream.resource.subview %16[%c64] : !stream.resource<external>{%13} -> !stream.resource<external>{%10} | |
%19 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %17 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%20 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %18 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%20, %19) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) { | |
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant> | |
} else { | |
%2 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%3 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %2[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
scf.yield %3, %2 : !stream.timepoint, !stream.resource<constant> | |
} | |
util.global.store %1#1, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %c128, @__constant_tensor_2x2x2x2xi32__storage_size : index | |
util.global.store %c0, @__constant_tensor_2x2x2x2xi32__offset : index | |
util.global.store %c64, @__constant_tensor_2x2x2x2xi32__length : index | |
util.global.store %1#0, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.global.store %1#1, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.global.store %c128, @__constant_tensor_4x4xi32__storage_size : index | |
util.global.store %c64, @__constant_tensor_4x4xi32__offset : index | |
util.global.store %c64, @__constant_tensor_4x4xi32__length : index | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) { | |
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant> | |
} else { | |
%2 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%3 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %2[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
scf.yield %3, %2 : !stream.timepoint, !stream.resource<constant> | |
} | |
util.global.store %1#1, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %c128, @__constant_tensor_2x2x2x2xi32__storage_size : index | |
util.global.store %c0, @__constant_tensor_2x2x2x2xi32__offset : index | |
util.global.store %c64, @__constant_tensor_2x2x2x2xi32__length : index | |
util.global.store %1#0, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.global.store %1#1, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.global.store %c128, @__constant_tensor_4x4xi32__storage_size : index | |
util.global.store %c64, @__constant_tensor_4x4xi32__offset : index | |
util.global.store %c64, @__constant_tensor_4x4xi32__length : index | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) { | |
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant> | |
} else { | |
%2 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%3 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %2[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
scf.yield %3, %2 : !stream.timepoint, !stream.resource<constant> | |
} | |
util.global.store %1#1, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %c64, @__constant_tensor_2x2x2x2xi32__length : index | |
util.global.store %c0, @__constant_tensor_2x2x2x2xi32__offset : index | |
util.global.store %c128, @__constant_tensor_2x2x2x2xi32__storage_size : index | |
util.global.store %1#0, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.global.store %1#1, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.global.store %c64, @__constant_tensor_4x4xi32__length : index | |
util.global.store %c64, @__constant_tensor_4x4xi32__offset : index | |
util.global.store %c128, @__constant_tensor_4x4xi32__storage_size : index | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_2x2x2x2xi32__storage_size = util.global.load @__constant_tensor_2x2x2x2xi32__storage_size : index | |
%__constant_tensor_2x2x2x2xi32__offset = util.global.load @__constant_tensor_2x2x2x2xi32__offset : index | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32__storage_size = util.global.load @__constant_tensor_4x4xi32__storage_size : index | |
%__constant_tensor_4x4xi32__offset = util.global.load @__constant_tensor_4x4xi32__offset : index | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = stream.timepoint.join max(%__constant_tensor_2x2x2x2xi32__timepoint, %__constant_tensor_2x2x2x2xi32__timepoint) => !stream.timepoint | |
%12 = util.align %10, %c64 : index | |
%13 = arith.addi %12, %c64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%11) => !stream.resource<external>{%13} => !stream.timepoint | |
%14 = stream.timepoint.join max(%__constant_tensor_2x2x2x2xi32__timepoint, %__constant_tensor_2x2x2x2xi32__timepoint, %result_timepoint) => !stream.timepoint | |
%15 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%14) => with(%__constant_tensor_4x4xi32 as %arg0: !stream.resource<constant>{%__constant_tensor_4x4xi32__storage_size}, %__constant_tensor_2x2x2x2xi32 as %arg1: !stream.resource<constant>{%__constant_tensor_2x2x2x2xi32__storage_size}, %result as %arg2: !stream.resource<external>{%13}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%0, %1, %5, %6, %4 : index, index, index, index, index) { | |
ro %arg0[%__constant_tensor_4x4xi32__offset for %3] : !stream.resource<constant>{%__constant_tensor_4x4xi32__storage_size}, | |
wo %arg2[%c64 for %10] : !stream.resource<external>{%13} | |
} | |
stream.cmd.copy %arg1[%__constant_tensor_2x2x2x2xi32__offset], %arg2[%c0], %c64 : !stream.resource<constant>{%__constant_tensor_2x2x2x2xi32__storage_size} -> !stream.resource<external>{%13} | |
stream.cmd.flush %arg2[%c0 for %c64] : !stream.resource<external>{%13} | |
} | |
} => !stream.timepoint | |
%16 = stream.timepoint.await %15 => %result : !stream.resource<external>{%13} | |
%17 = stream.resource.subview %16[%c0] : !stream.resource<external>{%13} -> !stream.resource<external>{%c64} | |
%18 = stream.resource.subview %16[%c64] : !stream.resource<external>{%13} -> !stream.resource<external>{%10} | |
%19 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %17 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%20 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %18 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%20, %19) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%__constant_tensor_4x4xi32__storage_size = util.global.load @__constant_tensor_4x4xi32__storage_size : index | |
%__constant_tensor_4x4xi32__offset = util.global.load @__constant_tensor_4x4xi32__offset : index | |
%__constant_tensor_2x2x2x2xi32__storage_size = util.global.load @__constant_tensor_2x2x2x2xi32__storage_size : index | |
%__constant_tensor_2x2x2x2xi32__offset = util.global.load @__constant_tensor_2x2x2x2xi32__offset : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = stream.timepoint.join max(%__constant_tensor_2x2x2x2xi32__timepoint, %__constant_tensor_2x2x2x2xi32__timepoint) => !stream.timepoint | |
%12 = util.align %10, %c64 : index | |
%13 = arith.addi %12, %c64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%11) => !stream.resource<external>{%13} => !stream.timepoint | |
%14 = stream.timepoint.join max(%__constant_tensor_2x2x2x2xi32__timepoint, %__constant_tensor_2x2x2x2xi32__timepoint, %result_timepoint) => !stream.timepoint | |
%15 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%14) => with(%__constant_tensor_4x4xi32 as %arg0: !stream.resource<constant>{%__constant_tensor_4x4xi32__storage_size}, %__constant_tensor_2x2x2x2xi32 as %arg1: !stream.resource<constant>{%__constant_tensor_2x2x2x2xi32__storage_size}, %result as %arg2: !stream.resource<external>{%13}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%0, %1, %5, %6, %4 : index, index, index, index, index) { | |
ro %arg0[%__constant_tensor_4x4xi32__offset for %3] : !stream.resource<constant>{%__constant_tensor_4x4xi32__storage_size}, | |
wo %arg2[%c64 for %10] : !stream.resource<external>{%13} | |
} | |
stream.cmd.copy %arg1[%__constant_tensor_2x2x2x2xi32__offset], %arg2[%c0], %c64 : !stream.resource<constant>{%__constant_tensor_2x2x2x2xi32__storage_size} -> !stream.resource<external>{%13} | |
stream.cmd.flush %arg2[%c0 for %c64] : !stream.resource<external>{%13} | |
} | |
} => !stream.timepoint | |
%16 = stream.timepoint.await %15 => %result : !stream.resource<external>{%13} | |
%17 = stream.resource.subview %16[%c0] : !stream.resource<external>{%13} -> !stream.resource<external>{%c64} | |
%18 = stream.resource.subview %16[%c64] : !stream.resource<external>{%13} -> !stream.resource<external>{%10} | |
%19 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %17 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%20 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %18 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%20, %19) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global private mutable @__constant_tensor_2x2x2x2xi32__storage_size = 128 : index | |
util.global private mutable @__constant_tensor_2x2x2x2xi32__offset = 0 : index | |
util.global private mutable @__constant_tensor_2x2x2x2xi32__length = 64 : index | |
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) { | |
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant> | |
} else { | |
%2 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%3 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %2[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
scf.yield %3, %2 : !stream.timepoint, !stream.resource<constant> | |
} | |
util.global.store %1#1, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %1#0, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.global.store %1#1, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.global private @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.global private mutable @__constant_tensor_4x4xi32__storage_size = 128 : index | |
util.global private mutable @__constant_tensor_4x4xi32__offset = 64 : index | |
util.global private mutable @__constant_tensor_4x4xi32__length = 64 : index | |
util.func private @_fully_dynamic_pack_simple() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_4x4xi32__storage_size = util.global.load @__constant_tensor_4x4xi32__storage_size : index | |
%__constant_tensor_4x4xi32__offset = util.global.load @__constant_tensor_4x4xi32__offset : index | |
%__constant_tensor_2x2x2x2xi32__storage_size = util.global.load @__constant_tensor_2x2x2x2xi32__storage_size : index | |
%__constant_tensor_2x2x2x2xi32__offset = util.global.load @__constant_tensor_2x2x2x2xi32__offset : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = util.align %10, %c64 : index | |
%12 = arith.addi %11, %c64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%12} => !stream.timepoint | |
%13 = stream.timepoint.join max(%__constant_tensor_2x2x2x2xi32__timepoint, %result_timepoint) => !stream.timepoint | |
%14 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%13) => with(%__constant_tensor_4x4xi32 as %arg0: !stream.resource<constant>{%__constant_tensor_4x4xi32__storage_size}, %__constant_tensor_2x2x2x2xi32 as %arg1: !stream.resource<constant>{%__constant_tensor_2x2x2x2xi32__storage_size}, %result as %arg2: !stream.resource<external>{%12}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%0, %1, %5, %6, %4 : index, index, index, index, index) { | |
ro %arg0[%__constant_tensor_4x4xi32__offset for %3] : !stream.resource<constant>{%__constant_tensor_4x4xi32__storage_size}, | |
wo %arg2[%c64 for %10] : !stream.resource<external>{%12} | |
} | |
stream.cmd.copy %arg1[%__constant_tensor_2x2x2x2xi32__offset], %arg2[%c0], %c64 : !stream.resource<constant>{%__constant_tensor_2x2x2x2xi32__storage_size} -> !stream.resource<external>{%12} | |
stream.cmd.flush %arg2[%c0 for %c64] : !stream.resource<external>{%12} | |
} | |
} => !stream.timepoint | |
%15 = stream.timepoint.await %14 => %result : !stream.resource<external>{%12} | |
%16 = stream.resource.subview %15[%c0] : !stream.resource<external>{%12} -> !stream.resource<external>{%c64} | |
%17 = stream.resource.subview %15[%c64] : !stream.resource<external>{%12} -> !stream.resource<external>{%10} | |
%18 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %16 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%19 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %17 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%19, %18) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) { | |
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant> | |
} else { | |
%2 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%3 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %2[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
scf.yield %3, %2 : !stream.timepoint, !stream.resource<constant> | |
} | |
util.global.store %1#1, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %1#0, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.global.store %1#1, @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.global private @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
util.func private @_fully_dynamic_pack_simple() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c128 = arith.constant 128 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_4x4xi32 = util.global.load immutable @__constant_tensor_4x4xi32 : !stream.resource<constant> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = util.align %10, %c64 : index | |
%12 = arith.addi %11, %c64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%12} => !stream.timepoint | |
%13 = stream.timepoint.join max(%__constant_tensor_2x2x2x2xi32__timepoint, %result_timepoint) => !stream.timepoint | |
%14 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%13) => with(%__constant_tensor_4x4xi32 as %arg0: !stream.resource<constant>{%c128}, %__constant_tensor_2x2x2x2xi32 as %arg1: !stream.resource<constant>{%c128}, %result as %arg2: !stream.resource<external>{%12}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%0, %1, %5, %6, %4 : index, index, index, index, index) { | |
ro %arg0[%c64 for %3] : !stream.resource<constant>{%c128}, | |
wo %arg2[%c64 for %10] : !stream.resource<external>{%12} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%12} | |
stream.cmd.flush %arg2[%c0 for %c64] : !stream.resource<external>{%12} | |
} | |
} => !stream.timepoint | |
%15 = stream.timepoint.await %14 => %result : !stream.resource<external>{%12} | |
%16 = stream.resource.subview %15[%c0] : !stream.resource<external>{%12} -> !stream.resource<external>{%c64} | |
%17 = stream.resource.subview %15[%c64] : !stream.resource<external>{%12} -> !stream.resource<external>{%10} | |
%18 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %16 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%19 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %17 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%19, %18) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) { | |
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant> | |
} else { | |
%2 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%3 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %2[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
scf.yield %3, %2 : !stream.timepoint, !stream.resource<constant> | |
} | |
util.global.store %1#1, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %1#0, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c128 = arith.constant 128 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_2x2x2x2xi32_0 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = util.align %10, %c64 : index | |
%12 = arith.addi %11, %c64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%12} => !stream.timepoint | |
%13 = stream.timepoint.join max(%__constant_tensor_2x2x2x2xi32__timepoint, %result_timepoint) => !stream.timepoint | |
%14 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%13) => with(%__constant_tensor_2x2x2x2xi32_0 as %arg0: !stream.resource<constant>{%c128}, %__constant_tensor_2x2x2x2xi32 as %arg1: !stream.resource<constant>{%c128}, %result as %arg2: !stream.resource<external>{%12}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%0, %1, %5, %6, %4 : index, index, index, index, index) { | |
ro %arg0[%c64 for %3] : !stream.resource<constant>{%c128}, | |
wo %arg2[%c64 for %10] : !stream.resource<external>{%12} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%12} | |
stream.cmd.flush %arg2[%c0 for %c64] : !stream.resource<external>{%12} | |
} | |
} => !stream.timepoint | |
%15 = stream.timepoint.await %14 => %result : !stream.resource<external>{%12} | |
%16 = stream.resource.subview %15[%c0] : !stream.resource<external>{%12} -> !stream.resource<external>{%c64} | |
%17 = stream.resource.subview %15[%c64] : !stream.resource<external>{%12} -> !stream.resource<external>{%10} | |
%18 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %16 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%19 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %17 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%19, %18) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After IPO (iree-util-ipo) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) { | |
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant> | |
} else { | |
%2 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%3 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %2[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
scf.yield %3, %2 : !stream.timepoint, !stream.resource<constant> | |
} | |
util.global.store %1#1, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %1#0, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c128 = arith.constant 128 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_2x2x2x2xi32_0 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = util.align %10, %c64 : index | |
%12 = arith.addi %11, %c64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%12} => !stream.timepoint | |
%13 = stream.timepoint.join max(%__constant_tensor_2x2x2x2xi32__timepoint, %result_timepoint) => !stream.timepoint | |
%14 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%13) => with(%__constant_tensor_2x2x2x2xi32_0 as %arg0: !stream.resource<constant>{%c128}, %__constant_tensor_2x2x2x2xi32 as %arg1: !stream.resource<constant>{%c128}, %result as %arg2: !stream.resource<external>{%12}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%0, %1, %5, %6, %4 : index, index, index, index, index) { | |
ro %arg0[%c64 for %3] : !stream.resource<constant>{%c128}, | |
wo %arg2[%c64 for %10] : !stream.resource<external>{%12} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%12} | |
stream.cmd.flush %arg2[%c0 for %c64] : !stream.resource<external>{%12} | |
} | |
} => !stream.timepoint | |
%15 = stream.timepoint.await %14 => %result : !stream.resource<external>{%12} | |
%16 = stream.resource.subview %15[%c0] : !stream.resource<external>{%12} -> !stream.resource<external>{%c64} | |
%17 = stream.resource.subview %15[%c64] : !stream.resource<external>{%12} -> !stream.resource<external>{%10} | |
%18 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %16 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%19 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %17 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%19, %18) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After VerifyLoweringToCmdPass (iree-stream-verify-lowering-to-cmd) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) { | |
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant> | |
} else { | |
%2 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%3 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %2[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
scf.yield %3, %2 : !stream.timepoint, !stream.resource<constant> | |
} | |
util.global.store %1#1, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %1#0, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c128 = arith.constant 128 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_2x2x2x2xi32_0 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = util.align %10, %c64 : index | |
%12 = arith.addi %11, %c64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%12} => !stream.timepoint | |
%13 = stream.timepoint.join max(%__constant_tensor_2x2x2x2xi32__timepoint, %result_timepoint) => !stream.timepoint | |
%14 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%13) => with(%__constant_tensor_2x2x2x2xi32_0 as %arg0: !stream.resource<constant>{%c128}, %__constant_tensor_2x2x2x2xi32 as %arg1: !stream.resource<constant>{%c128}, %result as %arg2: !stream.resource<external>{%12}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%0, %1, %5, %6, %4 : index, index, index, index, index) { | |
ro %arg0[%c64 for %3] : !stream.resource<constant>{%c128}, | |
wo %arg2[%c64 for %10] : !stream.resource<external>{%12} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%12} | |
stream.cmd.flush %arg2[%c0 for %c64] : !stream.resource<external>{%12} | |
} | |
} => !stream.timepoint | |
%15 = stream.timepoint.await %14 => %result : !stream.resource<external>{%12} | |
%16 = stream.resource.subview %15[%c0] : !stream.resource<external>{%12} -> !stream.resource<external>{%c64} | |
%17 = stream.resource.subview %15[%c64] : !stream.resource<external>{%12} -> !stream.resource<external>{%10} | |
%18 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %16 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%19 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %17 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%19, %18) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) { | |
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant> | |
} else { | |
%2 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%3 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %2[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
scf.yield %3, %2 : !stream.timepoint, !stream.resource<constant> | |
} | |
util.global.store %1#1, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %1#0, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) { | |
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant> | |
} else { | |
%2 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%3 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %2[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
scf.yield %3, %2 : !stream.timepoint, !stream.resource<constant> | |
} | |
util.global.store %1#1, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %1#0, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c128 = arith.constant 128 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%__constant_tensor_2x2x2x2xi32_0 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = util.align %10, %c64 : index | |
%12 = arith.addi %11, %c64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%12} => !stream.timepoint | |
%13 = stream.timepoint.join max(%__constant_tensor_2x2x2x2xi32__timepoint, %result_timepoint) => !stream.timepoint | |
%14 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%13) => with(%__constant_tensor_2x2x2x2xi32_0 as %arg0: !stream.resource<constant>{%c128}, %__constant_tensor_2x2x2x2xi32 as %arg1: !stream.resource<constant>{%c128}, %result as %arg2: !stream.resource<external>{%12}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%0, %1, %5, %6, %4 : index, index, index, index, index) { | |
ro %arg0[%c64 for %3] : !stream.resource<constant>{%c128}, | |
wo %arg2[%c64 for %10] : !stream.resource<external>{%12} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%12} | |
stream.cmd.flush %arg2[%c0 for %c64] : !stream.resource<external>{%12} | |
} | |
} => !stream.timepoint | |
%15 = stream.timepoint.await %14 => %result : !stream.resource<external>{%12} | |
%16 = stream.resource.subview %15[%c0] : !stream.resource<external>{%12} -> !stream.resource<external>{%c64} | |
%17 = stream.resource.subview %15[%c64] : !stream.resource<external>{%12} -> !stream.resource<external>{%10} | |
%18 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %16 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%19 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %17 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%19, %18) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) { | |
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant> | |
} else { | |
%2 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%3 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %2[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
scf.yield %3, %2 : !stream.timepoint, !stream.resource<constant> | |
} | |
util.global.store %1#1, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %1#0, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c128 = arith.constant 128 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = util.align %10, %c64 : index | |
%12 = arith.addi %11, %c64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%12} => !stream.timepoint | |
%13 = stream.timepoint.join max(%__constant_tensor_2x2x2x2xi32__timepoint, %result_timepoint) => !stream.timepoint | |
%14 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%13) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %__constant_tensor_2x2x2x2xi32 as %arg1: !stream.resource<constant>{%c128}, %result as %arg2: !stream.resource<external>{%12}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%0, %1, %5, %6, %4 : index, index, index, index, index) { | |
ro %arg0[%c64 for %3] : !stream.resource<constant>{%c128}, | |
wo %arg2[%c64 for %10] : !stream.resource<external>{%12} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%12} | |
stream.cmd.flush %arg2[%c0 for %c64] : !stream.resource<external>{%12} | |
} | |
} => !stream.timepoint | |
%15 = stream.timepoint.await %14 => %result : !stream.resource<external>{%12} | |
%16 = stream.resource.subview %15[%c0] : !stream.resource<external>{%12} -> !stream.resource<external>{%c64} | |
%17 = stream.resource.subview %15[%c64] : !stream.resource<external>{%12} -> !stream.resource<external>{%10} | |
%18 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %16 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%19 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %17 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%19, %18) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c128 = arith.constant 128 : index | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = util.align %10, %c64 : index | |
%12 = arith.addi %11, %c64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%12} => !stream.timepoint | |
%13 = stream.timepoint.join max(%__constant_tensor_2x2x2x2xi32__timepoint, %result_timepoint) => !stream.timepoint | |
%14 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%13) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %__constant_tensor_2x2x2x2xi32 as %arg1: !stream.resource<constant>{%c128}, %result as %arg2: !stream.resource<external>{%12}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%0, %1, %5, %6, %4 : index, index, index, index, index) { | |
ro %arg0[%c64 for %3] : !stream.resource<constant>{%c128}, | |
wo %arg2[%c64 for %10] : !stream.resource<external>{%12} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%12} | |
stream.cmd.flush %arg2[%c0 for %c64] : !stream.resource<external>{%12} | |
} | |
} => !stream.timepoint | |
%15 = stream.timepoint.await %14 => %result : !stream.resource<external>{%12} | |
%16 = stream.resource.subview %15[%c0] : !stream.resource<external>{%12} -> !stream.resource<external>{%c64} | |
%17 = stream.resource.subview %15[%c64] : !stream.resource<external>{%12} -> !stream.resource<external>{%10} | |
%18 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %16 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%19 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %17 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%19, %18) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) { | |
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant> | |
} else { | |
%2 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%3 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %2[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
scf.yield %3, %2 : !stream.timepoint, !stream.resource<constant> | |
} | |
util.global.store %1#1, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %1#0, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = util.align %10, %c64 : index | |
%12 = arith.addi %11, %c64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%12} => !stream.timepoint | |
%13 = stream.timepoint.join max(%__constant_tensor_2x2x2x2xi32__timepoint, %result_timepoint) => !stream.timepoint | |
%14 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%13) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%12}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%0, %1, %5, %6, %4 : index, index, index, index, index) { | |
ro %arg0[%c64 for %3] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c64 for %10] : !stream.resource<external>{%12} | |
} | |
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%12} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%12} | |
} | |
} => !stream.timepoint | |
%15 = stream.timepoint.await %14 => %result : !stream.resource<external>{%12} | |
%16 = stream.resource.subview %15[%c0] : !stream.resource<external>{%12} -> !stream.resource<external>{%c64} | |
%17 = stream.resource.subview %15[%c64] : !stream.resource<external>{%12} -> !stream.resource<external>{%10} | |
%18 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %16 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%19 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %17 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%19, %18) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) { | |
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant> | |
} else { | |
%2 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%3 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %2[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
scf.yield %3, %2 : !stream.timepoint, !stream.resource<constant> | |
} | |
util.global.store %1#1, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %1#0, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = util.align %10, %c64 : index | |
%12 = arith.addi %11, %c64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%12} => !stream.timepoint | |
%13 = stream.timepoint.join max(%__constant_tensor_2x2x2x2xi32__timepoint, %result_timepoint) => !stream.timepoint | |
%14 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%13) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%12}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%0, %1, %5, %6, %4 : index, index, index, index, index) { | |
ro %arg0[%c64 for %3] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c64 for %10] : !stream.resource<external>{%12} | |
} | |
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%12} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%12} | |
} | |
} => !stream.timepoint | |
%15 = stream.timepoint.await %14 => %result : !stream.resource<external>{%12} | |
%16 = stream.resource.subview %15[%c0] : !stream.resource<external>{%12} -> !stream.resource<external>{%c64} | |
%17 = stream.resource.subview %15[%c64] : !stream.resource<external>{%12} -> !stream.resource<external>{%10} | |
%18 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %16 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%19 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %17 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%19, %18) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) { | |
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant> | |
} else { | |
%2 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%3 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %2[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
scf.yield %3, %2 : !stream.timepoint, !stream.resource<constant> | |
} | |
util.global.store %1#1, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %1#0, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = util.align %10, %c64 : index | |
%12 = arith.addi %11, %c64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%12} => !stream.timepoint | |
%13 = stream.timepoint.join max(%__constant_tensor_2x2x2x2xi32__timepoint, %result_timepoint) => !stream.timepoint | |
%14 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%13) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%12}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%0, %1, %5, %6, %4 : index, index, index, index, index) { | |
ro %arg0[%c64 for %3] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c64 for %10] : !stream.resource<external>{%12} | |
} | |
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%12} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%12} | |
} | |
} => !stream.timepoint | |
%15 = stream.timepoint.await %14 => %result : !stream.resource<external>{%12} | |
%16 = stream.resource.subview %15[%c0] : !stream.resource<external>{%12} -> !stream.resource<external>{%c64} | |
%17 = stream.resource.subview %15[%c64] : !stream.resource<external>{%12} -> !stream.resource<external>{%10} | |
%18 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %16 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%19 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %17 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%19, %18) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After IPO (iree-util-ipo) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) { | |
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant> | |
} else { | |
%2 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%3 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %2[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
scf.yield %3, %2 : !stream.timepoint, !stream.resource<constant> | |
} | |
util.global.store %1#1, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %1#0, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = util.align %10, %c64 : index | |
%12 = arith.addi %11, %c64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%12} => !stream.timepoint | |
%13 = stream.timepoint.join max(%__constant_tensor_2x2x2x2xi32__timepoint, %result_timepoint) => !stream.timepoint | |
%14 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%13) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%12}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%0, %1, %5, %6, %4 : index, index, index, index, index) { | |
ro %arg0[%c64 for %3] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c64 for %10] : !stream.resource<external>{%12} | |
} | |
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%12} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%12} | |
} | |
} => !stream.timepoint | |
%15 = stream.timepoint.await %14 => %result : !stream.resource<external>{%12} | |
%16 = stream.resource.subview %15[%c0] : !stream.resource<external>{%12} -> !stream.resource<external>{%c64} | |
%17 = stream.resource.subview %15[%c64] : !stream.resource<external>{%12} -> !stream.resource<external>{%10} | |
%18 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %16 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%19 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %17 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%19, %18) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After SCFToControlFlow (convert-scf-to-cf) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After SCFToControlFlow (convert-scf-to-cf) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = util.align %10, %c64 : index | |
%12 = arith.addi %11, %c64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%12} => !stream.timepoint | |
%13 = stream.timepoint.join max(%__constant_tensor_2x2x2x2xi32__timepoint, %result_timepoint) => !stream.timepoint | |
%14 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%13) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%12}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%0, %1, %5, %6, %4 : index, index, index, index, index) { | |
ro %arg0[%c64 for %3] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c64 for %10] : !stream.resource<external>{%12} | |
} | |
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%12} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%12} | |
} | |
} => !stream.timepoint | |
%15 = stream.timepoint.await %14 => %result : !stream.resource<external>{%12} | |
%16 = stream.resource.subview %15[%c0] : !stream.resource<external>{%12} -> !stream.resource<external>{%c64} | |
%17 = stream.resource.subview %15[%c64] : !stream.resource<external>{%12} -> !stream.resource<external>{%10} | |
%18 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %16 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%19 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %17 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%19, %18) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After SCFToControlFlow (convert-scf-to-cf) //----- // | |
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
cf.br ^bb3(%0, %result : !stream.timepoint, !stream.resource<constant>) | |
^bb2: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb3(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb3(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb1, ^bb2 | |
cf.br ^bb4 | |
^bb4: // pred: ^bb3 | |
util.global.store %4, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %3, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
util.global.store %4, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %3, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
util.global.store %4, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %3, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = util.align %10, %c64 : index | |
%12 = arith.addi %11, %c64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%12} => !stream.timepoint | |
%13 = stream.timepoint.join max(%__constant_tensor_2x2x2x2xi32__timepoint, %result_timepoint) => !stream.timepoint | |
%14 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%13) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%12}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%0, %1, %5, %6, %4 : index, index, index, index, index) { | |
ro %arg0[%c64 for %3] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c64 for %10] : !stream.resource<external>{%12} | |
} | |
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%12} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%12} | |
} | |
} => !stream.timepoint | |
%15 = stream.timepoint.await %14 => %result : !stream.resource<external>{%12} | |
%16 = stream.resource.subview %15[%c0] : !stream.resource<external>{%12} -> !stream.resource<external>{%c64} | |
%17 = stream.resource.subview %15[%c64] : !stream.resource<external>{%12} -> !stream.resource<external>{%10} | |
%18 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %16 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%19 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %17 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%19, %18) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
util.global.store %4, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %3, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = util.align %10, %c64 : index | |
%12 = arith.addi %11, %c64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%12} => !stream.timepoint | |
%13 = stream.timepoint.join max(%__constant_tensor_2x2x2x2xi32__timepoint, %result_timepoint) => !stream.timepoint | |
%14 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%13) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%12}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%0, %1, %5, %6, %4 : index, index, index, index, index) { | |
ro %arg0[%c64 for %3] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c64 for %10] : !stream.resource<external>{%12} | |
} | |
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%12} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%12} | |
} | |
} => !stream.timepoint | |
%15 = stream.timepoint.await %14 => %result : !stream.resource<external>{%12} | |
%16 = stream.resource.subview %15[%c0] : !stream.resource<external>{%12} -> !stream.resource<external>{%c64} | |
%17 = stream.resource.subview %15[%c64] : !stream.resource<external>{%12} -> !stream.resource<external>{%10} | |
%18 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %16 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%19 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %17 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%19, %18) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = util.align %10, %c64 : index | |
%12 = arith.addi %11, %c64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%12} => !stream.timepoint | |
%13 = stream.timepoint.join max(%__constant_tensor_2x2x2x2xi32__timepoint, %result_timepoint) => !stream.timepoint | |
%14 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%13) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%12}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%0, %1, %5, %6, %4 : index, index, index, index, index) { | |
ro %arg0[%c64 for %3] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c64 for %10] : !stream.resource<external>{%12} | |
} | |
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%12} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%12} | |
} | |
} => !stream.timepoint | |
%15 = stream.timepoint.await %14 => %result : !stream.resource<external>{%12} | |
%16 = stream.resource.subview %15[%c0] : !stream.resource<external>{%12} -> !stream.resource<external>{%c64} | |
%17 = stream.resource.subview %15[%c64] : !stream.resource<external>{%12} -> !stream.resource<external>{%10} | |
%18 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %16 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%19 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %17 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%19, %18) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
util.global.store %4, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %3, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c128 = arith.constant 128 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = util.align %10, %c64 : index | |
%12 = arith.addi %11, %c64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%12} => !stream.timepoint | |
%13 = stream.timepoint.join max(%__constant_tensor_2x2x2x2xi32__timepoint, %result_timepoint) => !stream.timepoint | |
%14 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%13) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%12}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%0, %1, %5, %6, %4 : index, index, index, index, index) { | |
ro %arg0[%c64 for %3] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c64 for %10] : !stream.resource<external>{%12} | |
} | |
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%12} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%12} | |
} | |
} => !stream.timepoint | |
%15 = stream.timepoint.await %14 => %result : !stream.resource<external>{%12} | |
%16 = stream.resource.subview %15[%c0] : !stream.resource<external>{%12} -> !stream.resource<external>{%c64} | |
%17 = stream.resource.subview %15[%c64] : !stream.resource<external>{%12} -> !stream.resource<external>{%10} | |
%18 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %16 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%19 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %17 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%19, %18) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
util.global.store %4, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %3, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c128 = arith.constant 128 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = util.align %10, %c64 : index | |
%12 = arith.addi %11, %c64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%12} => !stream.timepoint | |
%13 = stream.timepoint.join max(%__constant_tensor_2x2x2x2xi32__timepoint, %result_timepoint) => !stream.timepoint | |
%14 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%13) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%12}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%0, %1, %5, %6, %4 : index, index, index, index, index) { | |
ro %arg0[%c64 for %3] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c64 for %10] : !stream.resource<external>{%12} | |
} | |
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%12} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%12} | |
} | |
} => !stream.timepoint | |
%15 = stream.timepoint.await %14 => %result : !stream.resource<external>{%12} | |
%16 = stream.resource.subview %15[%c0] : !stream.resource<external>{%12} -> !stream.resource<external>{%c64} | |
%17 = stream.resource.subview %15[%c64] : !stream.resource<external>{%12} -> !stream.resource<external>{%10} | |
%18 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %16 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%19 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %17 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%19, %18) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
// Public entry point (carries the iree.abi.stub attribute): it only calls the
// private @_fully_dynamic_pack_simple and returns no values.
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// Executable containing the single tensor.pack dispatch.
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
// Export: workgroup counts are computed from the five index workload operands.
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: %arg0 is the source binding, %arg6 the destination binding,
// and %arg1..%arg5 are index operands tagged as workload ordinals 0..4.
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
// Ordinals 0/1 size the 2-D read-only source; ordinals 2/3 size the two outer
// dims of the 4-D write-only destination and ordinal 4 is used for both inner dims.
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
// Load the entire source tensor (offsets 0, full sizes, unit strides).
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
// Pack source dims 0 and 1 using the same dynamic tile size %4 for both.
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
// Store the whole packed tensor to the destination binding.
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
// Globals holding the constant resource and its associated timepoint; both are
// written at the end of the initializer that follows.
util.global private @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
// Initializer: turns the 128-byte constant buffer into a !stream.resource<constant>
// and records the resource plus the timepoint to wait on.
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
// First attempt: map the buffer directly as a constant resource. On success
// branch to ^bb2 with the immediate timepoint %0 and the mapped resource.
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
// Fallback when %did_map is false: allocate a 128-byte constant resource and
// fill it by reading all 128 bytes of the buffer through a stream.file.
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
// Join point: %3/%4 are the timepoint/resource pair from whichever path ran.
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
util.global.store %4, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %3, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
// Test body: runs the pack dispatch into a freshly allocated resource and
// compares its output region with a 64-byte region copied from the constant resource.
util.func private @_fully_dynamic_pack_simple() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c128 = arith.constant 128 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
// %0 and %1 come from barriers over the constant 4, %4 from a barrier over 2;
// all size arithmetic below is expressed in terms of these values.
// presumably the barriers keep the shapes dynamic -- TODO confirm.
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
// %3 = %0 * 4 * %1: length of the dispatch's read-only range
// (presumably the i32 source size in bytes -- confirm).
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
// %10 = ceildiv(%0, %4) * 4 * ceildiv(%1, %4) * %4 * %4: length of the
// dispatch's write-only range.
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
// %12 = align(%10, 64) + 64: total size of the result allocation.
%11 = util.align %10, %c64 : index | |
%12 = arith.addi %11, %c64 : index | |
// Allocate the result resource; the alloca awaits the constant's timepoint.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%12} => !stream.timepoint | |
// Execution waits on the join of the constant timepoint and the allocation timepoint.
%13 = stream.timepoint.join max(%__constant_tensor_2x2x2x2xi32__timepoint, %result_timepoint) => !stream.timepoint | |
%14 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%13) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%12}) { | |
// Dispatch, copy and flush are all issued inside one stream.cmd.concurrent region.
stream.cmd.concurrent { | |
// The dispatch reads %arg0 at offset 64 for %3 bytes and writes %arg1 at
// offset 64 for %10 bytes.
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%0, %1, %5, %6, %4 : index, index, index, index, index) { | |
ro %arg0[%c64 for %3] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c64 for %10] : !stream.resource<external>{%12} | |
} | |
// Copy the first 64 bytes of the constant resource into the first 64 bytes
// of the result, then flush that range.
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%12} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%12} | |
} | |
} => !stream.timepoint | |
// Wait for the commands, then take two views of the result: offset 0 (size 64)
// and offset 64 (size %10).
%15 = stream.timepoint.await %14 => %result : !stream.resource<external>{%12} | |
%16 = stream.resource.subview %15[%c0] : !stream.resource<external>{%12} -> !stream.resource<external>{%c64} | |
%17 = stream.resource.subview %15[%c64] : !stream.resource<external>{%12} -> !stream.resource<external>{%10} | |
%18 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %16 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
// NOTE(review): %17 was produced with size %10, but this export declares the
// resource size as %c64 -- confirm the two are meant to agree.
%19 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %17 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
// Compare the dispatch-written region (%19) against the copied region (%18).
check.expect_eq(%19, %18) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After IPO (iree-util-ipo) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
// Public entry point (carries the iree.abi.stub attribute): it only calls the
// private @_fully_dynamic_pack_simple and returns no values.
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// Executable containing the single tensor.pack dispatch.
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
// Export: workgroup counts are computed from the five index workload operands.
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: %arg0 is the source binding, %arg6 the destination binding,
// and %arg1..%arg5 are index operands tagged as workload ordinals 0..4.
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
// Ordinals 0/1 size the 2-D read-only source; ordinals 2/3 size the two outer
// dims of the 4-D write-only destination and ordinal 4 is used for both inner dims.
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
// Load the entire source tensor (offsets 0, full sizes, unit strides).
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
// Pack source dims 0 and 1 using the same dynamic tile size %4 for both.
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
// Store the whole packed tensor to the destination binding.
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
// Globals holding the constant resource and its associated timepoint; both are
// written at the end of the initializer that follows.
util.global private @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
// Initializer: turns the 128-byte constant buffer into a !stream.resource<constant>
// and records the resource plus the timepoint to wait on.
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
// First attempt: map the buffer directly as a constant resource. On success
// branch to ^bb2 with the immediate timepoint %0 and the mapped resource.
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
// Fallback when %did_map is false: allocate a 128-byte constant resource and
// fill it by reading all 128 bytes of the buffer through a stream.file.
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
// Join point: %3/%4 are the timepoint/resource pair from whichever path ran.
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
util.global.store %4, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %3, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
// Test body: runs the pack dispatch into a freshly allocated resource and
// compares its output region with a 64-byte region copied from the constant resource.
util.func private @_fully_dynamic_pack_simple() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c128 = arith.constant 128 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
// %0 and %1 come from barriers over the constant 4, %4 from a barrier over 2;
// all size arithmetic below is expressed in terms of these values.
// presumably the barriers keep the shapes dynamic -- TODO confirm.
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
// %3 = %0 * 4 * %1: length of the dispatch's read-only range
// (presumably the i32 source size in bytes -- confirm).
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
// %10 = ceildiv(%0, %4) * 4 * ceildiv(%1, %4) * %4 * %4: length of the
// dispatch's write-only range.
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
// %12 = align(%10, 64) + 64: total size of the result allocation.
%11 = util.align %10, %c64 : index | |
%12 = arith.addi %11, %c64 : index | |
// Allocate the result resource; the alloca awaits the constant's timepoint.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%12} => !stream.timepoint | |
// Execution waits on the join of the constant timepoint and the allocation timepoint.
%13 = stream.timepoint.join max(%__constant_tensor_2x2x2x2xi32__timepoint, %result_timepoint) => !stream.timepoint | |
%14 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%13) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%12}) { | |
// Dispatch, copy and flush are all issued inside one stream.cmd.concurrent region.
stream.cmd.concurrent { | |
// The dispatch reads %arg0 at offset 64 for %3 bytes and writes %arg1 at
// offset 64 for %10 bytes.
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%0, %1, %5, %6, %4 : index, index, index, index, index) { | |
ro %arg0[%c64 for %3] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c64 for %10] : !stream.resource<external>{%12} | |
} | |
// Copy the first 64 bytes of the constant resource into the first 64 bytes
// of the result, then flush that range.
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%12} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%12} | |
} | |
} => !stream.timepoint | |
// Wait for the commands, then take two views of the result: offset 0 (size 64)
// and offset 64 (size %10).
%15 = stream.timepoint.await %14 => %result : !stream.resource<external>{%12} | |
%16 = stream.resource.subview %15[%c0] : !stream.resource<external>{%12} -> !stream.resource<external>{%c64} | |
%17 = stream.resource.subview %15[%c64] : !stream.resource<external>{%12} -> !stream.resource<external>{%10} | |
%18 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %16 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
// NOTE(review): %17 was produced with size %10, but this export declares the
// resource size as %c64 -- confirm the two are meant to agree.
%19 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %17 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
// Compare the dispatch-written region (%19) against the copied region (%18).
check.expect_eq(%19, %18) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After ElideTimepointsPass (iree-stream-elide-timepoints) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, iree.fixedpoint.modified, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
// Public entry point (carries the iree.abi.stub attribute): it only calls the
// private @_fully_dynamic_pack_simple and returns no values.
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// Executable containing the single tensor.pack dispatch.
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
// Export: workgroup counts are computed from the five index workload operands.
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: %arg0 is the source binding, %arg6 the destination binding,
// and %arg1..%arg5 are index operands tagged as workload ordinals 0..4.
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
// Ordinals 0/1 size the 2-D read-only source; ordinals 2/3 size the two outer
// dims of the 4-D write-only destination and ordinal 4 is used for both inner dims.
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
// Load the entire source tensor (offsets 0, full sizes, unit strides).
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
// Pack source dims 0 and 1 using the same dynamic tile size %4 for both.
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
// Store the whole packed tensor to the destination binding.
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
// Globals holding the constant resource and its associated timepoint; both are
// written at the end of the initializer that follows.
util.global private @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
// Initializer: turns the 128-byte constant buffer into a !stream.resource<constant>
// and records the resource plus the timepoint to wait on.
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
// First attempt: map the buffer directly as a constant resource. On success
// branch to ^bb2 with the immediate timepoint %0 and the mapped resource.
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
// Fallback when %did_map is false: allocate a 128-byte constant resource and
// fill it by reading all 128 bytes of the buffer through a stream.file.
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
// Join point: %3/%4 are the timepoint/resource pair from whichever path ran.
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
util.global.store %4, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %3, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
// Test body: runs the pack dispatch into a freshly allocated resource and
// compares its output region with a 64-byte region copied from the constant resource.
util.func private @_fully_dynamic_pack_simple() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c128 = arith.constant 128 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
// %0 and %1 come from barriers over the constant 4, %4 from a barrier over 2;
// all size arithmetic below is expressed in terms of these values.
// presumably the barriers keep the shapes dynamic -- TODO confirm.
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
// %3 = %0 * 4 * %1: length of the dispatch's read-only range
// (presumably the i32 source size in bytes -- confirm).
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
// %10 = ceildiv(%0, %4) * 4 * ceildiv(%1, %4) * %4 * %4: length of the
// dispatch's write-only range.
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
// %12 = align(%10, 64) + 64: total size of the result allocation.
%11 = util.align %10, %c64 : index | |
%12 = arith.addi %11, %c64 : index | |
// Allocate the result resource; the alloca awaits the constant's timepoint.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%12} => !stream.timepoint | |
// The join combines an immediate timepoint (%13) with the allocation timepoint;
// the constant's timepoint is not an operand of the join here.
%13 = stream.timepoint.immediate => !stream.timepoint | |
%14 = stream.timepoint.join max(%13, %result_timepoint) => !stream.timepoint | |
%15 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%14) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%12}) { | |
// Dispatch, copy and flush are all issued inside one stream.cmd.concurrent region.
stream.cmd.concurrent { | |
// The dispatch reads %arg0 at offset 64 for %3 bytes and writes %arg1 at
// offset 64 for %10 bytes.
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%0, %1, %5, %6, %4 : index, index, index, index, index) { | |
ro %arg0[%c64 for %3] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c64 for %10] : !stream.resource<external>{%12} | |
} | |
// Copy the first 64 bytes of the constant resource into the first 64 bytes
// of the result, then flush that range.
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%12} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%12} | |
} | |
} => !stream.timepoint | |
// Wait for the commands, then take two views of the result: offset 0 (size 64)
// and offset 64 (size %10).
%16 = stream.timepoint.await %15 => %result : !stream.resource<external>{%12} | |
%17 = stream.resource.subview %16[%c0] : !stream.resource<external>{%12} -> !stream.resource<external>{%c64} | |
%18 = stream.resource.subview %16[%c64] : !stream.resource<external>{%12} -> !stream.resource<external>{%10} | |
%19 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %17 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
// NOTE(review): %18 was produced with size %10, but this export declares the
// resource size as %c64 -- confirm the two are meant to agree.
%20 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %18 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
// Compare the dispatch-written region (%20) against the copied region (%19).
check.expect_eq(%20, %19) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
// Public entry point (carries the iree.abi.stub attribute): it only calls the
// private @_fully_dynamic_pack_simple and returns no values.
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
// Public entry point (carries the iree.abi.stub attribute): it only calls the
// private @_fully_dynamic_pack_simple and returns no values.
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
// Public entry point (carries the iree.abi.stub attribute): it only calls the
// private @_fully_dynamic_pack_simple and returns no values.
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
// Initializer: turns the 128-byte constant buffer into a !stream.resource<constant>
// and stores the resource plus the timepoint to wait on into two globals.
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
// Composite of two i32 tensors: the 2x2x2x2 tensor is listed first, the 4x4
// tensor second (presumably laid out in that order within the 128 bytes -- confirm).
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
// First attempt: map the buffer directly as a constant resource. On success
// branch to ^bb2 with the immediate timepoint %0 and the mapped resource.
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
// Fallback when %did_map is false: allocate a 128-byte constant resource and
// fill it by reading all 128 bytes of the buffer through a stream.file.
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
// Join point: %3/%4 are the timepoint/resource pair from whichever path ran.
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
util.global.store %4, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %3, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
// Test body: runs the pack dispatch into a freshly allocated resource and
// compares its output region with a 64-byte region copied from the constant resource.
util.func private @_fully_dynamic_pack_simple() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c128 = arith.constant 128 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
// %0 and %1 come from barriers over the constant 4, %4 from a barrier over 2;
// all size arithmetic below is expressed in terms of these values.
// presumably the barriers keep the shapes dynamic -- TODO confirm.
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
// %3 = %0 * 4 * %1: length of the dispatch's read-only range
// (presumably the i32 source size in bytes -- confirm).
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
// %10 = ceildiv(%0, %4) * 4 * ceildiv(%1, %4) * %4 * %4: length of the
// dispatch's write-only range.
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
// %12 = align(%10, 64) + 64: total size of the result allocation.
%11 = util.align %10, %c64 : index | |
%12 = arith.addi %11, %c64 : index | |
// Allocate the result resource; the alloca awaits the constant's timepoint.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%12} => !stream.timepoint | |
// Execution awaits the allocation timepoint directly (no join op in this dump).
%13 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%12}) { | |
// Dispatch, copy and flush are all issued inside one stream.cmd.concurrent region.
stream.cmd.concurrent { | |
// The dispatch reads %arg0 at offset 64 for %3 bytes and writes %arg1 at
// offset 64 for %10 bytes.
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%0, %1, %5, %6, %4 : index, index, index, index, index) { | |
ro %arg0[%c64 for %3] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c64 for %10] : !stream.resource<external>{%12} | |
} | |
// Copy the first 64 bytes of the constant resource into the first 64 bytes
// of the result, then flush that range.
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%12} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%12} | |
} | |
} => !stream.timepoint | |
// Wait for the commands, then take two views of the result: offset 0 (size 64)
// and offset 64 (size %10).
%14 = stream.timepoint.await %13 => %result : !stream.resource<external>{%12} | |
%15 = stream.resource.subview %14[%c0] : !stream.resource<external>{%12} -> !stream.resource<external>{%c64} | |
%16 = stream.resource.subview %14[%c64] : !stream.resource<external>{%12} -> !stream.resource<external>{%10} | |
%17 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %15 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
// NOTE(review): %16 was produced with size %10, but this export declares the
// resource size as %c64 -- confirm the two are meant to agree.
%18 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %16 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
// Compare the dispatch-written region (%18) against the copied region (%17).
check.expect_eq(%18, %17) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
// Initializer: turns the 128-byte constant buffer into a !stream.resource<constant>
// and stores the resource plus the timepoint to wait on into two globals.
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
// Composite of two i32 tensors: the 2x2x2x2 tensor is listed first, the 4x4
// tensor second (presumably laid out in that order within the 128 bytes -- confirm).
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
// First attempt: map the buffer directly as a constant resource. On success
// branch to ^bb2 with the immediate timepoint %0 and the mapped resource.
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
// Fallback when %did_map is false: allocate a 128-byte constant resource and
// fill it by reading all 128 bytes of the buffer through a stream.file.
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
// Join point: %3/%4 are the timepoint/resource pair from whichever path ran.
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
util.global.store %4, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %3, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
// Test body: runs the pack dispatch into a freshly allocated resource and
// compares its output region with a 64-byte region copied from the constant resource.
util.func private @_fully_dynamic_pack_simple() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c128 = arith.constant 128 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
// %0 and %1 come from barriers over the constant 4, %4 from a barrier over 2;
// all size arithmetic below is expressed in terms of these values.
// presumably the barriers keep the shapes dynamic -- TODO confirm.
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
// %3 = %0 * 4 * %1: length of the dispatch's read-only range
// (presumably the i32 source size in bytes -- confirm).
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
// %10 = ceildiv(%0, %4) * 4 * ceildiv(%1, %4) * %4 * %4: length of the
// dispatch's write-only range.
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
// %12 = align(%10, 64) + 64: total size of the result allocation.
%11 = util.align %10, %c64 : index | |
%12 = arith.addi %11, %c64 : index | |
// Allocate the result resource; the alloca awaits the constant's timepoint.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%12} => !stream.timepoint | |
// Execution awaits the allocation timepoint directly (no join op in this dump).
%13 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%12}) { | |
// Dispatch, copy and flush are all issued inside one stream.cmd.concurrent region.
stream.cmd.concurrent { | |
// The dispatch reads %arg0 at offset 64 for %3 bytes and writes %arg1 at
// offset 64 for %10 bytes.
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%0, %1, %5, %6, %4 : index, index, index, index, index) { | |
ro %arg0[%c64 for %3] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c64 for %10] : !stream.resource<external>{%12} | |
} | |
// Copy the first 64 bytes of the constant resource into the first 64 bytes
// of the result, then flush that range.
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%12} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%12} | |
} | |
} => !stream.timepoint | |
// Wait for the commands, then take two views of the result: offset 0 (size 64)
// and offset 64 (size %10).
%14 = stream.timepoint.await %13 => %result : !stream.resource<external>{%12} | |
%15 = stream.resource.subview %14[%c0] : !stream.resource<external>{%12} -> !stream.resource<external>{%c64} | |
%16 = stream.resource.subview %14[%c64] : !stream.resource<external>{%12} -> !stream.resource<external>{%10} | |
%17 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %15 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
// NOTE(review): %16 was produced with size %10, but this export declares the
// resource size as %c64 -- confirm the two are meant to agree.
%18 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %16 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
// Compare the dispatch-written region (%18) against the copied region (%17).
check.expect_eq(%18, %17) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
// Initializer: turns the 128-byte constant buffer into a !stream.resource<constant>
// and stores the resource plus the timepoint to wait on into two globals.
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
// Composite of two i32 tensors: the 2x2x2x2 tensor is listed first, the 4x4
// tensor second (presumably laid out in that order within the 128 bytes -- confirm).
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
// First attempt: map the buffer directly as a constant resource. On success
// branch to ^bb2 with the immediate timepoint %0 and the mapped resource.
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
// Fallback when %did_map is false: allocate a 128-byte constant resource and
// fill it by reading all 128 bytes of the buffer through a stream.file.
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
// Join point: %3/%4 are the timepoint/resource pair from whichever path ran.
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
util.global.store %4, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %3, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c128 = arith.constant 128 : index | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = util.align %10, %c64 : index | |
%12 = arith.addi %11, %c64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%12} => !stream.timepoint | |
%13 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%12}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%0, %1, %5, %6, %4 : index, index, index, index, index) { | |
ro %arg0[%c64 for %3] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c64 for %10] : !stream.resource<external>{%12} | |
} | |
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%12} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%12} | |
} | |
} => !stream.timepoint | |
%14 = stream.timepoint.await %13 => %result : !stream.resource<external>{%12} | |
%15 = stream.resource.subview %14[%c0] : !stream.resource<external>{%12} -> !stream.resource<external>{%c64} | |
%16 = stream.resource.subview %14[%c64] : !stream.resource<external>{%12} -> !stream.resource<external>{%10} | |
%17 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %15 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%18 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %16 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%18, %17) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 1 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
util.global.store %4, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %3, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = util.align %10, %c64 : index | |
%12 = arith.addi %11, %c64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%12} => !stream.timepoint | |
%13 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%12}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%0, %1, %5, %6, %4 : index, index, index, index, index) { | |
ro %arg0[%c64 for %3] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c64 for %10] : !stream.resource<external>{%12} | |
} | |
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%12} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%12} | |
} | |
} => !stream.timepoint | |
%14 = stream.timepoint.await %13 => %result : !stream.resource<external>{%12} | |
%15 = stream.resource.subview %14[%c0] : !stream.resource<external>{%12} -> !stream.resource<external>{%c64} | |
%16 = stream.resource.subview %14[%c64] : !stream.resource<external>{%12} -> !stream.resource<external>{%10} | |
%17 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %15 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%18 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %16 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%18, %17) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 1 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
util.global.store %4, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %3, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = util.align %10, %c64 : index | |
%12 = arith.addi %11, %c64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%12} => !stream.timepoint | |
%13 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%12}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%0, %1, %5, %6, %4 : index, index, index, index, index) { | |
ro %arg0[%c64 for %3] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c64 for %10] : !stream.resource<external>{%12} | |
} | |
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%12} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%12} | |
} | |
} => !stream.timepoint | |
%14 = stream.timepoint.await %13 => %result : !stream.resource<external>{%12} | |
%15 = stream.resource.subview %14[%c0] : !stream.resource<external>{%12} -> !stream.resource<external>{%c64} | |
%16 = stream.resource.subview %14[%c64] : !stream.resource<external>{%12} -> !stream.resource<external>{%10} | |
%17 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %15 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%18 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %16 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%18, %17) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 1 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
util.global.store %4, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %3, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = util.align %10, %c64 : index | |
%12 = arith.addi %11, %c64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%12} => !stream.timepoint | |
%13 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%12}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%0, %1, %5, %6, %4 : index, index, index, index, index) { | |
ro %arg0[%c64 for %3] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c64 for %10] : !stream.resource<external>{%12} | |
} | |
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%12} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%12} | |
} | |
} => !stream.timepoint | |
%14 = stream.timepoint.await %13 => %result : !stream.resource<external>{%12} | |
%15 = stream.resource.subview %14[%c0] : !stream.resource<external>{%12} -> !stream.resource<external>{%c64} | |
%16 = stream.resource.subview %14[%c64] : !stream.resource<external>{%12} -> !stream.resource<external>{%10} | |
%17 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %15 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%18 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %16 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%18, %17) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After IPO (iree-util-ipo) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 1 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
util.global.store %4, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %3, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = util.align %10, %c64 : index | |
%12 = arith.addi %11, %c64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%12} => !stream.timepoint | |
%13 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%12}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%0, %1, %5, %6, %4 : index, index, index, index, index) { | |
ro %arg0[%c64 for %3] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c64 for %10] : !stream.resource<external>{%12} | |
} | |
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%12} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%12} | |
} | |
} => !stream.timepoint | |
%14 = stream.timepoint.await %13 => %result : !stream.resource<external>{%12} | |
%15 = stream.resource.subview %14[%c0] : !stream.resource<external>{%12} -> !stream.resource<external>{%c64} | |
%16 = stream.resource.subview %14[%c64] : !stream.resource<external>{%12} -> !stream.resource<external>{%10} | |
%17 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %15 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%18 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %16 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%18, %17) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After ElideTimepointsPass (iree-stream-elide-timepoints) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
// Whole-program snapshot: a public ABI stub, one pack dispatch executable, the
// constant initializer with its two globals, and the private test body.
module attributes {iree.fixedpoint.iteration = 1 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
// Public entry point; it only forwards to the private @_fully_dynamic_pack_simple.
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// Executable containing the single tensor.pack dispatch.
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
// The workgroup count is computed from the five index workload values.
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: %arg0 is the read-only input binding, %arg6 the write-only
// output binding, and %arg1..%arg5 are workload ordinals 0..4.
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
// Ordinals 0-1 are used as the source dims, 2-3 as the outer dims of the
// packed result, and 4 as the size of both inner tile dims.
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
// Both bindings are viewed starting at offset %c0.
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
// Load the full source, pack dims [0, 1] with tiles [%4, %4], store the full result.
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
// Globals written by the initializer below: the constant resource and the
// timepoint associated with it.
util.global private @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
// Initializer: exposes the 128-byte #composite_of_128b buffer as a
// !stream.resource<constant> and records it in the two globals above.
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
// First attempt: map the host buffer directly; on success jump to ^bb2 with
// the immediate timepoint.
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
// Fallback when mapping did not succeed: allocate 128 bytes and fill them by
// reading the buffer through a stream.file.
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
// Join point: store whichever resource/timepoint pair the taken path produced.
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
util.global.store %4, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %3, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
// Test body: runs the pack dispatch on part of the constant resource and
// compares its output with another part of the same constant.
util.func private @_fully_dynamic_pack_simple() { | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
// %0 and %1 (both 4) are passed to the dispatch as the source dims; each is
// wrapped in util.optimization_barrier.
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
// %3 = %0 * 4 * %1 is the length of the dispatch's read-only range
// (the factor 4 is presumably sizeof(i32) - confirm).
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
// %4 (2, behind a barrier) is the tile size; %5/%6 are the ceil-divided outer dims.
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
// %10 = %5 * 4 * %6 * %4 * %4 is the length of the dispatch's write-only range.
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
// Total allocation: %10 rounded up to 64, plus 64 more for the copied region.
%11 = util.align %10, %c64 : index | |
%12 = arith.addi %11, %c64 : index | |
// The allocation waits on the constant's timepoint; the execute region waits
// on the allocation's timepoint.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%12} => !stream.timepoint | |
%13 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%12}) { | |
stream.cmd.concurrent { | |
// The dispatch reads the constant from offset 64 and writes the allocation from offset 64.
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%0, %1, %5, %6, %4 : index, index, index, index, index) { | |
ro %arg0[%c64 for %3] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c64 for %10] : !stream.resource<external>{%12} | |
} | |
// The first 64 bytes of the constant are copied to the first 64 bytes of the
// allocation (presumably the expected 2x2x2x2 tensor, listed first in
// #composite_of_128b - confirm), then that range is flushed.
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%12} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%12} | |
} | |
} => !stream.timepoint | |
%14 = stream.timepoint.await %13 => %result : !stream.resource<external>{%12} | |
// %15 is the copied region at offset 0; %16 is the dispatch output at offset 64.
%15 = stream.resource.subview %14[%c0] : !stream.resource<external>{%12} -> !stream.resource<external>{%c64} | |
%16 = stream.resource.subview %14[%c64] : !stream.resource<external>{%12} -> !stream.resource<external>{%10} | |
// NOTE(review): %16 is typed with size %10 by the subview, but the export that
// produces %18 states {%c64}; these agree only if %10 evaluates to 64 - confirm.
%17 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %15 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%18 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %16 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
// Compare the dispatch output (%18) with the copied region (%17).
check.expect_eq(%18, %17) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FixedPointIterator (iree-util-fixed-point-iterator) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
// Whole-program snapshot: a public ABI stub, one pack dispatch executable, the
// constant initializer with its two globals, and the private test body.
// The only module attribute present here is stream.affinity.default.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
// Public entry point; it only forwards to the private @_fully_dynamic_pack_simple.
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// Executable containing the single tensor.pack dispatch.
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
// The workgroup count is computed from the five index workload values.
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: %arg0 is the read-only input binding, %arg6 the write-only
// output binding, and %arg1..%arg5 are workload ordinals 0..4.
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
// Ordinals 0-1 size the source tensor, 2-3 the outer dims of the packed
// result, and 4 both inner tile dims.
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg5, 4 : index | |
// Both bindings are viewed starting at offset %c0.
%5 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
// Load everything, pack dims [0, 1] using [%4, %4] tiles, store everything.
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
// Globals written by the initializer below: the constant resource and the
// timepoint associated with it.
util.global private @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
// Initializer: turns the 128-byte #composite_of_128b buffer into a
// !stream.resource<constant> and stores it, with a timepoint, in the globals.
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
// Try a direct mapping first; when %did_map is true, ^bb2 receives the mapped
// resource together with the immediate timepoint.
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
// Otherwise allocate a 128-byte constant resource and read the buffer into it
// via a stream.file; the read's timepoint is forwarded to ^bb2.
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
// Join point: publish the resource and timepoint chosen above.
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
util.global.store %4, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %3, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
// Test body: runs the pack dispatch on part of the constant resource and
// compares its output with another part of the same constant.
util.func private @_fully_dynamic_pack_simple() { | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
// Source dims (%0, %1 = 4) behind optimization barriers.
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
// %3 = %0 * 4 * %1: length of the range the dispatch reads
// (the constant 4 is presumably the i32 element size - confirm).
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
// Tile size (%4 = 2) behind a barrier, and the ceil-divided outer dims %5/%6.
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
// %10 = %5 * 4 * %6 * %4 * %4: length of the range the dispatch writes.
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
// %12 = align(%10, 64) + 64: size of the single external allocation.
%11 = util.align %10, %c64 : index | |
%12 = arith.addi %11, %c64 : index | |
// alloca awaits the constant's timepoint; the execute region awaits the alloca.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%12} => !stream.timepoint | |
%13 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%12}) { | |
stream.cmd.concurrent { | |
// Dispatch: input is %arg0 at offset 64, output is %arg1 at offset 64.
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%0, %1, %5, %6, %4 : index, index, index, index, index) { | |
ro %arg0[%c64 for %3] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c64 for %10] : !stream.resource<external>{%12} | |
} | |
// Copy bytes [0, 64) of the constant into bytes [0, 64) of the allocation and
// flush that range (presumably the expected result tensor - confirm against
// the element order of #composite_of_128b).
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%12} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%12} | |
} | |
} => !stream.timepoint | |
%14 = stream.timepoint.await %13 => %result : !stream.resource<external>{%12} | |
// %15 views the copied region (offset 0); %16 views the dispatch output (offset 64).
%15 = stream.resource.subview %14[%c0] : !stream.resource<external>{%12} -> !stream.resource<external>{%c64} | |
%16 = stream.resource.subview %14[%c64] : !stream.resource<external>{%12} -> !stream.resource<external>{%10} | |
// NOTE(review): the export producing %18 declares size {%c64} for %16, whose
// subview type carries {%10}; verify the two are meant to be equal.
%17 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %15 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%18 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %16 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
// The check compares the dispatch output (%18) against the copied region (%17).
check.expect_eq(%18, %17) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FuseDispatchBindingsPass (iree-stream-fuse-dispatch-bindings) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
// Whole-program snapshot: a public ABI stub, one pack dispatch executable, the
// constant initializer with its two globals, and the private test body.
// In this snapshot the dispatch function takes its binding offsets as operands.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
// Public entry point; it only forwards to the private @_fully_dynamic_pack_simple.
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// Executable containing the single tensor.pack dispatch.
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
// The workgroup count is computed from the five index workload values.
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body. Argument order: the two bindings (%arg0 input, %arg1 output),
// then two index offsets (%arg2, %arg3), then workload ordinals 0..4 in
// %arg4..%arg8.
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
// %c0 has no use inside this function body.
%c0 = arith.constant 0 : index | |
// Ordinals 0-1 size the source tensor, 2-3 the outer dims of the packed
// result, and 4 both inner tile dims.
%0 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg8, 4 : index | |
// The subspan offsets come from the operands: %arg2 for the input binding and
// %arg3 for the output binding.
%5 = stream.binding.subspan %arg0[%arg2] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg1[%arg3] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
// Load the full source, pack dims [0, 1] with tiles [%4, %4], store the full result.
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
// Globals written by the initializer below: the constant resource and the
// timepoint associated with it.
util.global private @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
// Initializer: exposes the 128-byte #composite_of_128b buffer as a
// !stream.resource<constant> and records it in the two globals above.
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
// First attempt: map the host buffer directly; on success jump to ^bb2 with
// the immediate timepoint.
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
// Fallback when mapping did not succeed: allocate 128 bytes and fill them by
// reading the buffer through a stream.file.
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
// Join point: store whichever resource/timepoint pair the taken path produced.
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
util.global.store %4, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %3, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
// Test body: runs the pack dispatch on part of the constant resource and
// compares its output with another part of the same constant.
util.func private @_fully_dynamic_pack_simple() { | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
// %0 and %1 (both 4) are passed to the dispatch as the source dims; each is
// wrapped in util.optimization_barrier.
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
// %2 and %3 (%0 * 4 * %1) are still computed here, but %3 is not referenced by
// any later op in this function.
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
// %4 (2, behind a barrier) is the tile size; %5/%6 are the ceil-divided outer dims.
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
// %10 = %5 * 4 * %6 * %4 * %4; it feeds the allocation size and the %16 subview.
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
// Total allocation: %10 rounded up to 64, plus 64 more for the copied region.
%11 = util.align %10, %c64 : index | |
%12 = arith.addi %11, %c64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%12} => !stream.timepoint | |
// %c0_0 is the zero offset used for the whole-resource dispatch bindings below.
%c0_0 = arith.constant 0 : index | |
%13 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%12}) { | |
stream.cmd.concurrent { | |
// The dispatch binds each resource in full ([0 for 128] and [0 for %12]) and
// passes %c64, %c64 as the leading operands, which the dispatch function uses
// as its subspan offsets.
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%c64, %c64, %0, %1, %5, %6, %4 : index, index, index, index, index, index, index) { | |
ro %arg0[%c0_0 for %c128] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c0_0 for %12] : !stream.resource<external>{%12} | |
} | |
// The first 64 bytes of the constant are copied to the first 64 bytes of the
// allocation (presumably the expected 2x2x2x2 tensor, listed first in
// #composite_of_128b - confirm), then that range is flushed.
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%12} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%12} | |
} | |
} => !stream.timepoint | |
%14 = stream.timepoint.await %13 => %result : !stream.resource<external>{%12} | |
// %15 is the copied region at offset 0; %16 is the region at offset 64.
%15 = stream.resource.subview %14[%c0] : !stream.resource<external>{%12} -> !stream.resource<external>{%c64} | |
%16 = stream.resource.subview %14[%c64] : !stream.resource<external>{%12} -> !stream.resource<external>{%10} | |
// NOTE(review): %16 is typed with size %10 by the subview, but the export that
// produces %18 states {%c64}; these agree only if %10 evaluates to 64 - confirm.
%17 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %15 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%18 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %16 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
// Compare the region at offset 64 (%18) with the copied region (%17).
check.expect_eq(%18, %17) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After AnnotateDispatchArgumentsPass (iree-stream-annotate-dispatch-arguments) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
// Whole-program snapshot: a public ABI stub, one pack dispatch executable, the
// constant initializer with its two globals, and the private test body.
// In this snapshot the dispatch function arguments carry stream.alignment /
// stream.values attributes.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
// Public entry point; it only forwards to the private @_fully_dynamic_pack_simple.
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// Executable containing the single tensor.pack dispatch.
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
// The workgroup count is computed from the five index workload values.
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body. Both bindings are annotated with stream.alignment = 64; the
// two offset operands %arg2/%arg3 are annotated with stream.alignment = 64 and
// stream.values = [64]. %arg4..%arg8 are workload ordinals 0..4 and carry no
// annotations.
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: index {stream.alignment = 64 : index, stream.values = [64 : index]}, %arg3: index {stream.alignment = 64 : index, stream.values = [64 : index]}, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
// %c0 has no use inside this function body.
%c0 = arith.constant 0 : index | |
// Ordinals 0-1 are used as the source dims, 2-3 as the outer dims of the
// packed result, and 4 as the size of both inner tile dims.
%0 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg8, 4 : index | |
// The subspan offsets come from the operands: %arg2 for the input binding and
// %arg3 for the output binding.
%5 = stream.binding.subspan %arg0[%arg2] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%6 = stream.binding.subspan %arg1[%arg3] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
// Load everything, pack dims [0, 1] using [%4, %4] tiles, store everything.
%7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
%8 = tensor.empty(%2, %3, %4, %4) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [%4, %4] into %8 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %6, offsets = [0, 0, 0, 0], sizes = [%2, %3, %4, %4], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%2, %3, %4, %4} | |
return | |
} | |
} | |
} | |
// Globals written by the initializer below: the constant resource and the
// timepoint associated with it.
util.global private @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
// Initializer: turns the 128-byte #composite_of_128b buffer into a
// !stream.resource<constant> and stores it, with a timepoint, in the globals.
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
// Try a direct mapping first; when %did_map is true, ^bb2 receives the mapped
// resource together with the immediate timepoint.
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
// Otherwise allocate a 128-byte constant resource and read the buffer into it
// via a stream.file; the read's timepoint is forwarded to ^bb2.
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
// Join point: publish the resource and timepoint chosen above.
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
util.global.store %4, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %3, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
// Test body: runs the pack dispatch on part of the constant resource and
// compares its output with another part of the same constant.
util.func private @_fully_dynamic_pack_simple() { | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
// Source dims (%0, %1 = 4) behind optimization barriers.
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
// %2 and %3 (%0 * 4 * %1) are still computed here, but %3 is not referenced by
// any later op in this function.
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
// Tile size (%4 = 2) behind a barrier, and the ceil-divided outer dims %5/%6.
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
// %10 = %5 * 4 * %6 * %4 * %4; it feeds the allocation size and the %16 subview.
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
// %12 = align(%10, 64) + 64: size of the single external allocation.
%11 = util.align %10, %c64 : index | |
%12 = arith.addi %11, %c64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%12} => !stream.timepoint | |
// %c0_0 is the zero offset used for the whole-resource dispatch bindings below.
%c0_0 = arith.constant 0 : index | |
%13 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%12}) { | |
stream.cmd.concurrent { | |
// The dispatch binds each resource in full and passes %c64, %c64 first; those
// two operands line up with the offset arguments annotated stream.values = [64]
// on the dispatch function.
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%c64, %c64, %0, %1, %5, %6, %4 : index, index, index, index, index, index, index) { | |
ro %arg0[%c0_0 for %c128] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c0_0 for %12] : !stream.resource<external>{%12} | |
} | |
// Copy bytes [0, 64) of the constant into bytes [0, 64) of the allocation and
// flush that range (presumably the expected result tensor - confirm against
// the element order of #composite_of_128b).
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%12} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%12} | |
} | |
} => !stream.timepoint | |
%14 = stream.timepoint.await %13 => %result : !stream.resource<external>{%12} | |
// %15 views the copied region (offset 0); %16 views the region at offset 64.
%15 = stream.resource.subview %14[%c0] : !stream.resource<external>{%12} -> !stream.resource<external>{%c64} | |
%16 = stream.resource.subview %14[%c64] : !stream.resource<external>{%12} -> !stream.resource<external>{%10} | |
// NOTE(review): the export producing %18 declares size {%c64} for %16, whose
// subview type carries {%10}; verify the two are meant to be equal.
%17 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %15 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%18 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %16 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
// The check compares the region at offset 64 (%18) against the copied region (%17).
check.expect_eq(%18, %17) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After PackDispatchOperandsPass (iree-stream-pack-dispatch-operands) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32, %arg11: i32, %arg12: i32, %arg13: i32, %arg14: i32, %arg15: i32) { | |
%0 = arith.extui %arg2 : i32 to i64 | |
%1 = arith.extui %arg3 : i32 to i64 | |
%c32_i64 = arith.constant 32 : i64 | |
%2 = arith.shli %1, %c32_i64 : i64 | |
%3 = arith.ori %0, %2 : i64 | |
%4 = arith.index_castui %3 {stream.alignment = 64 : index, stream.values = [64 : index]} : i64 to index | |
%5 = arith.extui %arg4 : i32 to i64 | |
%6 = arith.extui %arg5 : i32 to i64 | |
%c32_i64_0 = arith.constant 32 : i64 | |
%7 = arith.shli %6, %c32_i64_0 : i64 | |
%8 = arith.ori %5, %7 : i64 | |
%9 = arith.index_castui %8 {stream.alignment = 64 : index, stream.values = [64 : index]} : i64 to index | |
%10 = arith.extui %arg6 : i32 to i64 | |
%11 = arith.extui %arg7 : i32 to i64 | |
%c32_i64_1 = arith.constant 32 : i64 | |
%12 = arith.shli %11, %c32_i64_1 : i64 | |
%13 = arith.ori %10, %12 : i64 | |
%14 = arith.index_castui %13 : i64 to index | |
%15 = arith.extui %arg8 : i32 to i64 | |
%16 = arith.extui %arg9 : i32 to i64 | |
%c32_i64_2 = arith.constant 32 : i64 | |
%17 = arith.shli %16, %c32_i64_2 : i64 | |
%18 = arith.ori %15, %17 : i64 | |
%19 = arith.index_castui %18 : i64 to index | |
%20 = arith.extui %arg10 : i32 to i64 | |
%21 = arith.extui %arg11 : i32 to i64 | |
%c32_i64_3 = arith.constant 32 : i64 | |
%22 = arith.shli %21, %c32_i64_3 : i64 | |
%23 = arith.ori %20, %22 : i64 | |
%24 = arith.index_castui %23 : i64 to index | |
%25 = arith.extui %arg12 : i32 to i64 | |
%26 = arith.extui %arg13 : i32 to i64 | |
%c32_i64_4 = arith.constant 32 : i64 | |
%27 = arith.shli %26, %c32_i64_4 : i64 | |
%28 = arith.ori %25, %27 : i64 | |
%29 = arith.index_castui %28 : i64 to index | |
%30 = arith.extui %arg14 : i32 to i64 | |
%31 = arith.extui %arg15 : i32 to i64 | |
%c32_i64_5 = arith.constant 32 : i64 | |
%32 = arith.shli %31, %c32_i64_5 : i64 | |
%33 = arith.ori %30, %32 : i64 | |
%34 = arith.index_castui %33 : i64 to index | |
%c0 = arith.constant 0 : index | |
%35 = flow.dispatch.workload.ordinal %14, 0 : index | |
%36 = flow.dispatch.workload.ordinal %19, 1 : index | |
%37 = flow.dispatch.workload.ordinal %24, 2 : index | |
%38 = flow.dispatch.workload.ordinal %29, 3 : index | |
%39 = flow.dispatch.workload.ordinal %34, 4 : index | |
%40 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%35, %36} | |
%41 = stream.binding.subspan %arg1[%9] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%37, %38, %39, %39} | |
%42 = flow.dispatch.tensor.load %40, offsets = [0, 0], sizes = [%35, %36], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%35, %36} -> tensor<?x?xi32> | |
%43 = tensor.empty(%37, %38, %39, %39) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %42 inner_dims_pos = [0, 1] inner_tiles = [%39, %39] into %43 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %41, offsets = [0, 0, 0, 0], sizes = [%37, %38, %39, %39], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%37, %38, %39, %39} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
util.global.store %4, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %3, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = arith.muli %0, %c4 : index | |
%3 = arith.muli %2, %1 : index | |
%4 = util.optimization_barrier %c2 : index | |
%5 = arith.ceildivui %0, %4 : index | |
%6 = arith.ceildivui %1, %4 : index | |
%7 = arith.muli %5, %c4 : index | |
%8 = arith.muli %7, %6 : index | |
%9 = arith.muli %8, %4 : index | |
%10 = arith.muli %9, %4 : index | |
%11 = util.align %10, %c64 : index | |
%12 = arith.addi %11, %c64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%12} => !stream.timepoint | |
%c0_0 = arith.constant 0 : index | |
%c64_i64 = arith.constant 64 : i64 | |
%c64_i32 = arith.constant 64 : i32 | |
%c32_i64 = arith.constant 32 : i64 | |
%c0_i64 = arith.constant 0 : i64 | |
%c0_i32 = arith.constant 0 : i32 | |
%c64_i64_1 = arith.constant 64 : i64 | |
%c64_i32_2 = arith.constant 64 : i32 | |
%c32_i64_3 = arith.constant 32 : i64 | |
%c0_i64_4 = arith.constant 0 : i64 | |
%c0_i32_5 = arith.constant 0 : i32 | |
%13 = arith.index_castui %0 : index to i64 | |
%14 = arith.trunci %13 : i64 to i32 | |
%c32_i64_6 = arith.constant 32 : i64 | |
%15 = arith.shrui %13, %c32_i64_6 : i64 | |
%16 = arith.trunci %15 : i64 to i32 | |
%17 = arith.index_castui %1 : index to i64 | |
%18 = arith.trunci %17 : i64 to i32 | |
%c32_i64_7 = arith.constant 32 : i64 | |
%19 = arith.shrui %17, %c32_i64_7 : i64 | |
%20 = arith.trunci %19 : i64 to i32 | |
%21 = arith.index_castui %5 : index to i64 | |
%22 = arith.trunci %21 : i64 to i32 | |
%c32_i64_8 = arith.constant 32 : i64 | |
%23 = arith.shrui %21, %c32_i64_8 : i64 | |
%24 = arith.trunci %23 : i64 to i32 | |
%25 = arith.index_castui %6 : index to i64 | |
%26 = arith.trunci %25 : i64 to i32 | |
%c32_i64_9 = arith.constant 32 : i64 | |
%27 = arith.shrui %25, %c32_i64_9 : i64 | |
%28 = arith.trunci %27 : i64 to i32 | |
%29 = arith.index_castui %4 : index to i64 | |
%30 = arith.trunci %29 : i64 to i32 | |
%c32_i64_10 = arith.constant 32 : i64 | |
%31 = arith.shrui %29, %c32_i64_10 : i64 | |
%32 = arith.trunci %31 : i64 to i32 | |
%33 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%12}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %5, %6, %4](%c64_i32, %c0_i32, %c64_i32_2, %c0_i32_5, %14, %16, %18, %20, %22, %24, %26, %28, %30, %32 : i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { | |
ro %arg0[%c0_0 for %c128] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c0_0 for %12] : !stream.resource<external>{%12} | |
} | |
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%12} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%12} | |
} | |
} => !stream.timepoint | |
%34 = stream.timepoint.await %33 => %result : !stream.resource<external>{%12} | |
%35 = stream.resource.subview %34[%c0] : !stream.resource<external>{%12} -> !stream.resource<external>{%c64} | |
%36 = stream.resource.subview %34[%c64] : !stream.resource<external>{%12} -> !stream.resource<external>{%10} | |
%37 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %35 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%38 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %36 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%38, %37) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
util.global.store %4, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %3, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
util.global.store %4, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %3, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%c0_i32 = arith.constant 0 : i32 | |
%c32_i64 = arith.constant 32 : i64 | |
%c64_i32 = arith.constant 64 : i32 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c2 : index | |
%3 = arith.ceildivui %0, %2 : index | |
%4 = arith.ceildivui %1, %2 : index | |
%5 = arith.muli %3, %c4 : index | |
%6 = arith.muli %5, %4 : index | |
%7 = arith.muli %6, %2 : index | |
%8 = arith.muli %7, %2 : index | |
%9 = util.align %8, %c64 : index | |
%10 = arith.addi %9, %c64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%10} => !stream.timepoint | |
%11 = arith.index_castui %0 : index to i64 | |
%12 = arith.trunci %11 : i64 to i32 | |
%13 = arith.shrui %11, %c32_i64 : i64 | |
%14 = arith.trunci %13 : i64 to i32 | |
%15 = arith.index_castui %1 : index to i64 | |
%16 = arith.trunci %15 : i64 to i32 | |
%17 = arith.shrui %15, %c32_i64 : i64 | |
%18 = arith.trunci %17 : i64 to i32 | |
%19 = arith.index_castui %3 : index to i64 | |
%20 = arith.trunci %19 : i64 to i32 | |
%21 = arith.shrui %19, %c32_i64 : i64 | |
%22 = arith.trunci %21 : i64 to i32 | |
%23 = arith.index_castui %4 : index to i64 | |
%24 = arith.trunci %23 : i64 to i32 | |
%25 = arith.shrui %23, %c32_i64 : i64 | |
%26 = arith.trunci %25 : i64 to i32 | |
%27 = arith.index_castui %2 : index to i64 | |
%28 = arith.trunci %27 : i64 to i32 | |
%29 = arith.shrui %27, %c32_i64 : i64 | |
%30 = arith.trunci %29 : i64 to i32 | |
%31 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%10}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %3, %4, %2](%c64_i32, %c0_i32, %c64_i32, %c0_i32, %12, %14, %16, %18, %20, %22, %24, %26, %28, %30 : i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { | |
ro %arg0[%c0 for %c128] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c0 for %10] : !stream.resource<external>{%10} | |
} | |
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%10} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%10} | |
} | |
} => !stream.timepoint | |
%32 = stream.timepoint.await %31 => %result : !stream.resource<external>{%10} | |
%33 = stream.resource.subview %32[%c0] : !stream.resource<external>{%10} -> !stream.resource<external>{%c64} | |
%34 = stream.resource.subview %32[%c64] : !stream.resource<external>{%10} -> !stream.resource<external>{%8} | |
%35 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %33 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%36 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %34 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%36, %35) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
util.global.store %4, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %3, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%c0_i32 = arith.constant 0 : i32 | |
%c32_i64 = arith.constant 32 : i64 | |
%c64_i32 = arith.constant 64 : i32 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c2 : index | |
%3 = arith.ceildivui %0, %2 : index | |
%4 = arith.ceildivui %1, %2 : index | |
%5 = arith.muli %3, %c4 : index | |
%6 = arith.muli %5, %4 : index | |
%7 = arith.muli %6, %2 : index | |
%8 = arith.muli %7, %2 : index | |
%9 = util.align %8, %c64 : index | |
%10 = arith.addi %9, %c64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%10} => !stream.timepoint | |
%11 = arith.index_castui %0 : index to i64 | |
%12 = arith.trunci %11 : i64 to i32 | |
%13 = arith.shrui %11, %c32_i64 : i64 | |
%14 = arith.trunci %13 : i64 to i32 | |
%15 = arith.index_castui %1 : index to i64 | |
%16 = arith.trunci %15 : i64 to i32 | |
%17 = arith.shrui %15, %c32_i64 : i64 | |
%18 = arith.trunci %17 : i64 to i32 | |
%19 = arith.index_castui %3 : index to i64 | |
%20 = arith.trunci %19 : i64 to i32 | |
%21 = arith.shrui %19, %c32_i64 : i64 | |
%22 = arith.trunci %21 : i64 to i32 | |
%23 = arith.index_castui %4 : index to i64 | |
%24 = arith.trunci %23 : i64 to i32 | |
%25 = arith.shrui %23, %c32_i64 : i64 | |
%26 = arith.trunci %25 : i64 to i32 | |
%27 = arith.index_castui %2 : index to i64 | |
%28 = arith.trunci %27 : i64 to i32 | |
%29 = arith.shrui %27, %c32_i64 : i64 | |
%30 = arith.trunci %29 : i64 to i32 | |
%31 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%10}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %3, %4, %2](%c64_i32, %c0_i32, %c64_i32, %c0_i32, %12, %14, %16, %18, %20, %22, %24, %26, %28, %30 : i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { | |
ro %arg0[%c0 for %c128] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c0 for %10] : !stream.resource<external>{%10} | |
} | |
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%10} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%10} | |
} | |
} => !stream.timepoint | |
%32 = stream.timepoint.await %31 => %result : !stream.resource<external>{%10} | |
%33 = stream.resource.subview %32[%c0] : !stream.resource<external>{%10} -> !stream.resource<external>{%c64} | |
%34 = stream.resource.subview %32[%c64] : !stream.resource<external>{%10} -> !stream.resource<external>{%8} | |
%35 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %33 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%36 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %34 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%36, %35) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
util.func private @_fully_dynamic_pack_simple() { | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%c0_i32 = arith.constant 0 : i32 | |
%c32_i64 = arith.constant 32 : i64 | |
%c64_i32 = arith.constant 64 : i32 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c2 : index | |
%3 = arith.ceildivui %0, %2 : index | |
%4 = arith.ceildivui %1, %2 : index | |
%5 = arith.muli %3, %c4 : index | |
%6 = arith.muli %5, %4 : index | |
%7 = arith.muli %6, %2 : index | |
%8 = arith.muli %7, %2 : index | |
%9 = util.align %8, %c64 : index | |
%10 = arith.addi %9, %c64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%10} => !stream.timepoint | |
%11 = arith.index_castui %0 : index to i64 | |
%12 = arith.trunci %11 : i64 to i32 | |
%13 = arith.shrui %11, %c32_i64 : i64 | |
%14 = arith.trunci %13 : i64 to i32 | |
%15 = arith.index_castui %1 : index to i64 | |
%16 = arith.trunci %15 : i64 to i32 | |
%17 = arith.shrui %15, %c32_i64 : i64 | |
%18 = arith.trunci %17 : i64 to i32 | |
%19 = arith.index_castui %3 : index to i64 | |
%20 = arith.trunci %19 : i64 to i32 | |
%21 = arith.shrui %19, %c32_i64 : i64 | |
%22 = arith.trunci %21 : i64 to i32 | |
%23 = arith.index_castui %4 : index to i64 | |
%24 = arith.trunci %23 : i64 to i32 | |
%25 = arith.shrui %23, %c32_i64 : i64 | |
%26 = arith.trunci %25 : i64 to i32 | |
%27 = arith.index_castui %2 : index to i64 | |
%28 = arith.trunci %27 : i64 to i32 | |
%29 = arith.shrui %27, %c32_i64 : i64 | |
%30 = arith.trunci %29 : i64 to i32 | |
%31 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%10}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %3, %4, %2](%c64_i32, %c0_i32, %c64_i32, %c0_i32, %12, %14, %16, %18, %20, %22, %24, %26, %28, %30 : i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { | |
ro %arg0[%c0 for %c128] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c0 for %10] : !stream.resource<external>{%10} | |
} | |
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%10} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%10} | |
} | |
} => !stream.timepoint | |
%32 = stream.timepoint.await %31 => %result : !stream.resource<external>{%10} | |
%33 = stream.resource.subview %32[%c0] : !stream.resource<external>{%10} -> !stream.resource<external>{%c64} | |
%34 = stream.resource.subview %32[%c64] : !stream.resource<external>{%10} -> !stream.resource<external>{%8} | |
%35 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %33 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%36 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %34 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%36, %35) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
// Public entry point (marked iree.abi.stub). It takes no arguments, returns nothing,
// and only forwards to the private @_fully_dynamic_pack_simple.
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
// Export for the pack dispatch entry point. The workgroups region receives five
// index workload values and returns the (x, y, z) workgroup count computed by
// flow.dispatch.workgroup_count_from_slice.
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body for the dynamic tensor.pack.
// Arguments: two bindings (%arg0 is loaded from, %arg1 is stored to) followed by
// 14 i32 operands. Each consecutive pair of i32 operands is recombined into one
// 64-bit value as (zext(lo) | (zext(hi) << 32)) and cast to index, giving 7 index values:
//   %4, %9                  - offsets used for the %arg0 / %arg1 subspans
//   %14, %19, %24, %29, %34 - workload ordinals 0..4 (dynamic dimensions)
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32, %arg11: i32, %arg12: i32, %arg13: i32, %arg14: i32, %arg15: i32) { | |
%c32_i64 = arith.constant 32 : i64 | |
// (%arg2, %arg3) -> %4: offset into %arg0 (annotated with stream.values = [64]).
%0 = arith.extui %arg2 : i32 to i64 | |
%1 = arith.extui %arg3 : i32 to i64 | |
%2 = arith.shli %1, %c32_i64 : i64 | |
%3 = arith.ori %0, %2 : i64 | |
%4 = arith.index_castui %3 {stream.alignment = 64 : index, stream.values = [64 : index]} : i64 to index | |
// (%arg4, %arg5) -> %9: offset into %arg1 (annotated with stream.values = [64]).
%5 = arith.extui %arg4 : i32 to i64 | |
%6 = arith.extui %arg5 : i32 to i64 | |
%7 = arith.shli %6, %c32_i64 : i64 | |
%8 = arith.ori %5, %7 : i64 | |
%9 = arith.index_castui %8 {stream.alignment = 64 : index, stream.values = [64 : index]} : i64 to index | |
// (%arg6, %arg7) -> %14: becomes workload ordinal 0.
%10 = arith.extui %arg6 : i32 to i64 | |
%11 = arith.extui %arg7 : i32 to i64 | |
%12 = arith.shli %11, %c32_i64 : i64 | |
%13 = arith.ori %10, %12 : i64 | |
%14 = arith.index_castui %13 : i64 to index | |
// (%arg8, %arg9) -> %19: becomes workload ordinal 1.
%15 = arith.extui %arg8 : i32 to i64 | |
%16 = arith.extui %arg9 : i32 to i64 | |
%17 = arith.shli %16, %c32_i64 : i64 | |
%18 = arith.ori %15, %17 : i64 | |
%19 = arith.index_castui %18 : i64 to index | |
// (%arg10, %arg11) -> %24: becomes workload ordinal 2.
%20 = arith.extui %arg10 : i32 to i64 | |
%21 = arith.extui %arg11 : i32 to i64 | |
%22 = arith.shli %21, %c32_i64 : i64 | |
%23 = arith.ori %20, %22 : i64 | |
%24 = arith.index_castui %23 : i64 to index | |
// (%arg12, %arg13) -> %29: becomes workload ordinal 3.
%25 = arith.extui %arg12 : i32 to i64 | |
%26 = arith.extui %arg13 : i32 to i64 | |
%27 = arith.shli %26, %c32_i64 : i64 | |
%28 = arith.ori %25, %27 : i64 | |
%29 = arith.index_castui %28 : i64 to index | |
// (%arg14, %arg15) -> %34: becomes workload ordinal 4.
%30 = arith.extui %arg14 : i32 to i64 | |
%31 = arith.extui %arg15 : i32 to i64 | |
%32 = arith.shli %31, %c32_i64 : i64 | |
%33 = arith.ori %30, %32 : i64 | |
%34 = arith.index_castui %33 : i64 to index | |
// Ordinals 0/1 (%35, %36) are the two dims of the 2-D source; ordinals 2/3 (%37, %38)
// are the two outer dims of the 4-D result; ordinal 4 (%39) is used for both inner
// dims and as both inner tile sizes of the pack.
%35 = flow.dispatch.workload.ordinal %14, 0 : index | |
%36 = flow.dispatch.workload.ordinal %19, 1 : index | |
%37 = flow.dispatch.workload.ordinal %24, 2 : index | |
%38 = flow.dispatch.workload.ordinal %29, 3 : index | |
%39 = flow.dispatch.workload.ordinal %34, 4 : index | |
// Readonly ?x?xi32 view of %arg0 at offset %4; writeonly ?x?x?x?xi32 view of %arg1 at offset %9.
%40 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%35, %36} | |
%41 = stream.binding.subspan %arg1[%9] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%37, %38, %39, %39} | |
// Load the whole source, pack both source dims (inner_dims_pos = [0, 1]) with tile
// size %39 into a fresh empty tensor, then store the whole packed result.
%42 = flow.dispatch.tensor.load %40, offsets = [0, 0], sizes = [%35, %36], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%35, %36} -> tensor<?x?xi32> | |
%43 = tensor.empty(%37, %38, %39, %39) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %42 inner_dims_pos = [0, 1] inner_tiles = [%39, %39] into %43 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %41, offsets = [0, 0, 0, 0], sizes = [%37, %38, %39, %39], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%37, %38, %39, %39} | |
return | |
} | |
} | |
} | |
// Globals for the 128-byte constant resource and its associated timepoint (the
// timepoint global starts out as an immediate timepoint). Both are stored by the
// util.initializer that follows.
util.global private @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
// Initializer: materializes #composite_of_128b as a !stream.resource<constant> of
// size %c128 and stores the resource and its timepoint into the two globals above.
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
// First try to map the buffer constant as a resource; on success jump straight to
// ^bb2 with the immediate timepoint %0 and the mapped resource.
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
// Path taken when the map did not succeed: allocate an uninitialized resource of
// size %c128, wrap the buffer constant as a stream.file, and read %c128 from the
// file into the resource; the read's timepoint is forwarded to ^bb2.
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
// Join point: store whichever (timepoint, resource) pair was produced.
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
util.global.store %4, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %3, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
// Test body: runs the pack dispatch into a freshly allocated external resource,
// copies the first 64 bytes of the constant resource into the same allocation, and
// compares the two exported 2x2x2x2xi32 tensors with check.expect_eq.
util.func private @_fully_dynamic_pack_simple() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c128 = arith.constant 128 : index | |
%c64_i32 = arith.constant 64 : i32 | |
%c32_i64 = arith.constant 32 : i64 | |
%c0_i32 = arith.constant 0 : i32 | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
// %0 and %1 carry the value 4 and %2 the value 2, each behind an optimization
// barrier. They are passed as dispatch workload values 0, 1 and 4 below.
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c2 : index | |
// %3, %4 = ceil(%0 / %2), ceil(%1 / %2): passed as dispatch workload values 2 and 3.
%3 = arith.ceildivui %0, %2 : index | |
%4 = arith.ceildivui %1, %2 : index | |
// %8 = %3 * 4 * %4 * %2 * %2, used as the size of the result subview below.
// NOTE(review): the factor 4 is presumably the i32 element size in bytes - confirm.
%5 = arith.muli %3, %c4 : index | |
%6 = arith.muli %5, %4 : index | |
%7 = arith.muli %6, %2 : index | |
%8 = arith.muli %7, %2 : index | |
// Allocation size %10 = align(%8, 64) + 64; the extra 64 covers the region at
// offset 0 that receives the copy from the constant resource.
%9 = util.align %8, %c64 : index | |
%10 = arith.addi %9, %c64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%10} => !stream.timepoint | |
// Split each of %0, %1, %3, %4, %2 into low (trunci) and high (shrui 32, trunci)
// i32 halves so they can be passed as i32 dispatch operands.
%11 = arith.index_castui %0 : index to i64 | |
%12 = arith.trunci %11 : i64 to i32 | |
%13 = arith.shrui %11, %c32_i64 : i64 | |
%14 = arith.trunci %13 : i64 to i32 | |
%15 = arith.index_castui %1 : index to i64 | |
%16 = arith.trunci %15 : i64 to i32 | |
%17 = arith.shrui %15, %c32_i64 : i64 | |
%18 = arith.trunci %17 : i64 to i32 | |
%19 = arith.index_castui %3 : index to i64 | |
%20 = arith.trunci %19 : i64 to i32 | |
%21 = arith.shrui %19, %c32_i64 : i64 | |
%22 = arith.trunci %21 : i64 to i32 | |
%23 = arith.index_castui %4 : index to i64 | |
%24 = arith.trunci %23 : i64 to i32 | |
%25 = arith.shrui %23, %c32_i64 : i64 | |
%26 = arith.trunci %25 : i64 to i32 | |
%27 = arith.index_castui %2 : index to i64 | |
%28 = arith.trunci %27 : i64 to i32 | |
%29 = arith.shrui %27, %c32_i64 : i64 | |
%30 = arith.trunci %29 : i64 to i32 | |
// Command buffer: %arg0 is the constant resource, %arg1 the new allocation.
%31 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%10}) { | |
stream.cmd.concurrent { | |
// Dispatch operands: the first two (lo, hi) pairs are (64, 0) and (64, 0), which
// the dispatch function recombines into its two binding offsets; the remaining
// five pairs are the split workload values.
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %3, %4, %2](%c64_i32, %c0_i32, %c64_i32, %c0_i32, %12, %14, %16, %18, %20, %22, %24, %26, %28, %30 : i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { | |
ro %arg0[%c0 for %c128] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c0 for %10] : !stream.resource<external>{%10} | |
} | |
// Copy %c64 bytes from offset 0 of the constant resource to offset 0 of the
// allocation, then flush that range.
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%10} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%10} | |
} | |
} => !stream.timepoint | |
%32 = stream.timepoint.await %31 => %result : !stream.resource<external>{%10} | |
// %33: range at offset 0 (target of the copy); %34: range at offset 64 with size %8.
%33 = stream.resource.subview %32[%c0] : !stream.resource<external>{%10} -> !stream.resource<external>{%c64} | |
%34 = stream.resource.subview %32[%c64] : !stream.resource<external>{%10} -> !stream.resource<external>{%8} | |
// Export both ranges as 2x2x2x2xi32 tensors and compare them.
%35 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %33 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%36 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %34 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%36, %35) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
// Attribute aliases referenced by the module below.
// #composite_of_128b is a 128xi8 composite listing two constants, in this order:
// a 2x2x2x2xi32 tensor and a 4x4xi32 tensor (16 i32 values each).
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
// Executable target: "llvm-cpu" backend, "embedded-elf-x86_64" format, cpu = "generic".
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
// "local" device target whose only listed executable target is the alias above.
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
// Public entry point (marked iree.abi.stub). It takes no arguments, returns nothing,
// and only forwards to the private @_fully_dynamic_pack_simple.
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
// Export for the pack dispatch entry point. The workgroups region receives five
// index workload values and returns the (x, y, z) workgroup count computed by
// flow.dispatch.workgroup_count_from_slice.
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body for the dynamic tensor.pack.
// Arguments: two bindings (%arg0 is loaded from, %arg1 is stored to) followed by
// 14 i32 operands. Each consecutive pair of i32 operands is recombined into one
// 64-bit value as (zext(lo) | (zext(hi) << 32)) and cast to index, giving 7 index values:
//   %4, %9                  - offsets used for the %arg0 / %arg1 subspans
//   %14, %19, %24, %29, %34 - workload ordinals 0..4 (dynamic dimensions)
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32, %arg11: i32, %arg12: i32, %arg13: i32, %arg14: i32, %arg15: i32) { | |
%c32_i64 = arith.constant 32 : i64 | |
// (%arg2, %arg3) -> %4: offset into %arg0 (annotated with stream.values = [64]).
%0 = arith.extui %arg2 : i32 to i64 | |
%1 = arith.extui %arg3 : i32 to i64 | |
%2 = arith.shli %1, %c32_i64 : i64 | |
%3 = arith.ori %0, %2 : i64 | |
%4 = arith.index_castui %3 {stream.alignment = 64 : index, stream.values = [64 : index]} : i64 to index | |
// (%arg4, %arg5) -> %9: offset into %arg1 (annotated with stream.values = [64]).
%5 = arith.extui %arg4 : i32 to i64 | |
%6 = arith.extui %arg5 : i32 to i64 | |
%7 = arith.shli %6, %c32_i64 : i64 | |
%8 = arith.ori %5, %7 : i64 | |
%9 = arith.index_castui %8 {stream.alignment = 64 : index, stream.values = [64 : index]} : i64 to index | |
// (%arg6, %arg7) -> %14: becomes workload ordinal 0.
%10 = arith.extui %arg6 : i32 to i64 | |
%11 = arith.extui %arg7 : i32 to i64 | |
%12 = arith.shli %11, %c32_i64 : i64 | |
%13 = arith.ori %10, %12 : i64 | |
%14 = arith.index_castui %13 : i64 to index | |
// (%arg8, %arg9) -> %19: becomes workload ordinal 1.
%15 = arith.extui %arg8 : i32 to i64 | |
%16 = arith.extui %arg9 : i32 to i64 | |
%17 = arith.shli %16, %c32_i64 : i64 | |
%18 = arith.ori %15, %17 : i64 | |
%19 = arith.index_castui %18 : i64 to index | |
// (%arg10, %arg11) -> %24: becomes workload ordinal 2.
%20 = arith.extui %arg10 : i32 to i64 | |
%21 = arith.extui %arg11 : i32 to i64 | |
%22 = arith.shli %21, %c32_i64 : i64 | |
%23 = arith.ori %20, %22 : i64 | |
%24 = arith.index_castui %23 : i64 to index | |
// (%arg12, %arg13) -> %29: becomes workload ordinal 3.
%25 = arith.extui %arg12 : i32 to i64 | |
%26 = arith.extui %arg13 : i32 to i64 | |
%27 = arith.shli %26, %c32_i64 : i64 | |
%28 = arith.ori %25, %27 : i64 | |
%29 = arith.index_castui %28 : i64 to index | |
// (%arg14, %arg15) -> %34: becomes workload ordinal 4.
%30 = arith.extui %arg14 : i32 to i64 | |
%31 = arith.extui %arg15 : i32 to i64 | |
%32 = arith.shli %31, %c32_i64 : i64 | |
%33 = arith.ori %30, %32 : i64 | |
%34 = arith.index_castui %33 : i64 to index | |
// Ordinals 0/1 (%35, %36) are the two dims of the 2-D source; ordinals 2/3 (%37, %38)
// are the two outer dims of the 4-D result; ordinal 4 (%39) is used for both inner
// dims and as both inner tile sizes of the pack.
%35 = flow.dispatch.workload.ordinal %14, 0 : index | |
%36 = flow.dispatch.workload.ordinal %19, 1 : index | |
%37 = flow.dispatch.workload.ordinal %24, 2 : index | |
%38 = flow.dispatch.workload.ordinal %29, 3 : index | |
%39 = flow.dispatch.workload.ordinal %34, 4 : index | |
// Readonly ?x?xi32 view of %arg0 at offset %4; writeonly ?x?x?x?xi32 view of %arg1 at offset %9.
%40 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%35, %36} | |
%41 = stream.binding.subspan %arg1[%9] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%37, %38, %39, %39} | |
// Load the whole source, pack both source dims (inner_dims_pos = [0, 1]) with tile
// size %39 into a fresh empty tensor, then store the whole packed result.
%42 = flow.dispatch.tensor.load %40, offsets = [0, 0], sizes = [%35, %36], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%35, %36} -> tensor<?x?xi32> | |
%43 = tensor.empty(%37, %38, %39, %39) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %42 inner_dims_pos = [0, 1] inner_tiles = [%39, %39] into %43 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %41, offsets = [0, 0, 0, 0], sizes = [%37, %38, %39, %39], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%37, %38, %39, %39} | |
return | |
} | |
} | |
} | |
// Globals for the 128-byte constant resource and its associated timepoint (the
// timepoint global starts out as an immediate timepoint). Both are stored by the
// util.initializer that follows.
util.global private @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
// Initializer: materializes #composite_of_128b as a !stream.resource<constant> of
// size %c128 and stores the resource and its timepoint into the two globals above.
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
// First try to map the buffer constant as a resource; on success jump straight to
// ^bb2 with the immediate timepoint %0 and the mapped resource.
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
// Path taken when the map did not succeed: allocate an uninitialized resource of
// size %c128, wrap the buffer constant as a stream.file, and read %c128 from the
// file into the resource; the read's timepoint is forwarded to ^bb2.
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
// Join point: store whichever (timepoint, resource) pair was produced.
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
util.global.store %4, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %3, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
// Test body: runs the pack dispatch into a freshly allocated external resource,
// copies the first 64 bytes of the constant resource into the same allocation, and
// compares the two exported 2x2x2x2xi32 tensors with check.expect_eq.
util.func private @_fully_dynamic_pack_simple() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c128 = arith.constant 128 : index | |
%c64_i32 = arith.constant 64 : i32 | |
%c32_i64 = arith.constant 32 : i64 | |
%c0_i32 = arith.constant 0 : i32 | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
// %0 and %1 carry the value 4 and %2 the value 2, each behind an optimization
// barrier. They are passed as dispatch workload values 0, 1 and 4 below.
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c2 : index | |
// %3, %4 = ceil(%0 / %2), ceil(%1 / %2): passed as dispatch workload values 2 and 3.
%3 = arith.ceildivui %0, %2 : index | |
%4 = arith.ceildivui %1, %2 : index | |
// %8 = %3 * 4 * %4 * %2 * %2, used as the size of the result subview below.
// NOTE(review): the factor 4 is presumably the i32 element size in bytes - confirm.
%5 = arith.muli %3, %c4 : index | |
%6 = arith.muli %5, %4 : index | |
%7 = arith.muli %6, %2 : index | |
%8 = arith.muli %7, %2 : index | |
// Allocation size %10 = align(%8, 64) + 64; the extra 64 covers the region at
// offset 0 that receives the copy from the constant resource.
%9 = util.align %8, %c64 : index | |
%10 = arith.addi %9, %c64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%10} => !stream.timepoint | |
// Split each of %0, %1, %3, %4, %2 into low (trunci) and high (shrui 32, trunci)
// i32 halves so they can be passed as i32 dispatch operands.
%11 = arith.index_castui %0 : index to i64 | |
%12 = arith.trunci %11 : i64 to i32 | |
%13 = arith.shrui %11, %c32_i64 : i64 | |
%14 = arith.trunci %13 : i64 to i32 | |
%15 = arith.index_castui %1 : index to i64 | |
%16 = arith.trunci %15 : i64 to i32 | |
%17 = arith.shrui %15, %c32_i64 : i64 | |
%18 = arith.trunci %17 : i64 to i32 | |
%19 = arith.index_castui %3 : index to i64 | |
%20 = arith.trunci %19 : i64 to i32 | |
%21 = arith.shrui %19, %c32_i64 : i64 | |
%22 = arith.trunci %21 : i64 to i32 | |
%23 = arith.index_castui %4 : index to i64 | |
%24 = arith.trunci %23 : i64 to i32 | |
%25 = arith.shrui %23, %c32_i64 : i64 | |
%26 = arith.trunci %25 : i64 to i32 | |
%27 = arith.index_castui %2 : index to i64 | |
%28 = arith.trunci %27 : i64 to i32 | |
%29 = arith.shrui %27, %c32_i64 : i64 | |
%30 = arith.trunci %29 : i64 to i32 | |
// Command buffer: %arg0 is the constant resource, %arg1 the new allocation.
%31 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%10}) { | |
stream.cmd.concurrent { | |
// Dispatch operands: the first two (lo, hi) pairs are (64, 0) and (64, 0), which
// the dispatch function recombines into its two binding offsets; the remaining
// five pairs are the split workload values.
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %3, %4, %2](%c64_i32, %c0_i32, %c64_i32, %c0_i32, %12, %14, %16, %18, %20, %22, %24, %26, %28, %30 : i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { | |
ro %arg0[%c0 for %c128] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c0 for %10] : !stream.resource<external>{%10} | |
} | |
// Copy %c64 bytes from offset 0 of the constant resource to offset 0 of the
// allocation, then flush that range.
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%10} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%10} | |
} | |
} => !stream.timepoint | |
%32 = stream.timepoint.await %31 => %result : !stream.resource<external>{%10} | |
// %33: range at offset 0 (target of the copy); %34: range at offset 64 with size %8.
%33 = stream.resource.subview %32[%c0] : !stream.resource<external>{%10} -> !stream.resource<external>{%c64} | |
%34 = stream.resource.subview %32[%c64] : !stream.resource<external>{%10} -> !stream.resource<external>{%8} | |
// Export both ranges as 2x2x2x2xi32 tensors and compare them.
%35 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %33 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%36 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %34 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%36, %35) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // | |
// Attribute aliases referenced by the module below.
// #composite_of_128b is a 128xi8 composite listing two constants, in this order:
// a 2x2x2x2xi32 tensor and a 4x4xi32 tensor (16 i32 values each).
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
// Executable target: "llvm-cpu" backend, "embedded-elf-x86_64" format, cpu = "generic".
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
// "local" device target whose only listed executable target is the alias above.
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
// Public entry point (marked iree.abi.stub). It takes no arguments, returns nothing,
// and only forwards to the private @_fully_dynamic_pack_simple.
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
// Export for the pack dispatch entry point. The workgroups region receives five
// index workload values and returns the (x, y, z) workgroup count computed by
// flow.dispatch.workgroup_count_from_slice.
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body for the dynamic tensor.pack.
// Arguments: two bindings (%arg0 is loaded from, %arg1 is stored to) followed by
// 14 i32 operands. Each consecutive pair of i32 operands is recombined into one
// 64-bit value as (zext(lo) | (zext(hi) << 32)) and cast to index, giving 7 index values:
//   %4, %9                  - offsets used for the %arg0 / %arg1 subspans
//   %14, %19, %24, %29, %34 - workload ordinals 0..4 (dynamic dimensions)
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32, %arg11: i32, %arg12: i32, %arg13: i32, %arg14: i32, %arg15: i32) { | |
%c32_i64 = arith.constant 32 : i64 | |
// (%arg2, %arg3) -> %4: offset into %arg0 (annotated with stream.values = [64]).
%0 = arith.extui %arg2 : i32 to i64 | |
%1 = arith.extui %arg3 : i32 to i64 | |
%2 = arith.shli %1, %c32_i64 : i64 | |
%3 = arith.ori %0, %2 : i64 | |
%4 = arith.index_castui %3 {stream.alignment = 64 : index, stream.values = [64 : index]} : i64 to index | |
// (%arg4, %arg5) -> %9: offset into %arg1 (annotated with stream.values = [64]).
%5 = arith.extui %arg4 : i32 to i64 | |
%6 = arith.extui %arg5 : i32 to i64 | |
%7 = arith.shli %6, %c32_i64 : i64 | |
%8 = arith.ori %5, %7 : i64 | |
%9 = arith.index_castui %8 {stream.alignment = 64 : index, stream.values = [64 : index]} : i64 to index | |
// (%arg6, %arg7) -> %14: becomes workload ordinal 0.
%10 = arith.extui %arg6 : i32 to i64 | |
%11 = arith.extui %arg7 : i32 to i64 | |
%12 = arith.shli %11, %c32_i64 : i64 | |
%13 = arith.ori %10, %12 : i64 | |
%14 = arith.index_castui %13 : i64 to index | |
// (%arg8, %arg9) -> %19: becomes workload ordinal 1.
%15 = arith.extui %arg8 : i32 to i64 | |
%16 = arith.extui %arg9 : i32 to i64 | |
%17 = arith.shli %16, %c32_i64 : i64 | |
%18 = arith.ori %15, %17 : i64 | |
%19 = arith.index_castui %18 : i64 to index | |
// (%arg10, %arg11) -> %24: becomes workload ordinal 2.
%20 = arith.extui %arg10 : i32 to i64 | |
%21 = arith.extui %arg11 : i32 to i64 | |
%22 = arith.shli %21, %c32_i64 : i64 | |
%23 = arith.ori %20, %22 : i64 | |
%24 = arith.index_castui %23 : i64 to index | |
// (%arg12, %arg13) -> %29: becomes workload ordinal 3.
%25 = arith.extui %arg12 : i32 to i64 | |
%26 = arith.extui %arg13 : i32 to i64 | |
%27 = arith.shli %26, %c32_i64 : i64 | |
%28 = arith.ori %25, %27 : i64 | |
%29 = arith.index_castui %28 : i64 to index | |
// (%arg14, %arg15) -> %34: becomes workload ordinal 4.
%30 = arith.extui %arg14 : i32 to i64 | |
%31 = arith.extui %arg15 : i32 to i64 | |
%32 = arith.shli %31, %c32_i64 : i64 | |
%33 = arith.ori %30, %32 : i64 | |
%34 = arith.index_castui %33 : i64 to index | |
// Ordinals 0/1 (%35, %36) are the two dims of the 2-D source; ordinals 2/3 (%37, %38)
// are the two outer dims of the 4-D result; ordinal 4 (%39) is used for both inner
// dims and as both inner tile sizes of the pack.
%35 = flow.dispatch.workload.ordinal %14, 0 : index | |
%36 = flow.dispatch.workload.ordinal %19, 1 : index | |
%37 = flow.dispatch.workload.ordinal %24, 2 : index | |
%38 = flow.dispatch.workload.ordinal %29, 3 : index | |
%39 = flow.dispatch.workload.ordinal %34, 4 : index | |
// Readonly ?x?xi32 view of %arg0 at offset %4; writeonly ?x?x?x?xi32 view of %arg1 at offset %9.
%40 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%35, %36} | |
%41 = stream.binding.subspan %arg1[%9] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%37, %38, %39, %39} | |
// Load the whole source, pack both source dims (inner_dims_pos = [0, 1]) with tile
// size %39 into a fresh empty tensor, then store the whole packed result.
%42 = flow.dispatch.tensor.load %40, offsets = [0, 0], sizes = [%35, %36], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%35, %36} -> tensor<?x?xi32> | |
%43 = tensor.empty(%37, %38, %39, %39) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %42 inner_dims_pos = [0, 1] inner_tiles = [%39, %39] into %43 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %41, offsets = [0, 0, 0, 0], sizes = [%37, %38, %39, %39], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%37, %38, %39, %39} | |
return | |
} | |
} | |
} | |
// Globals for the 128-byte constant resource and its associated timepoint (the
// timepoint global starts out as an immediate timepoint). Both are stored by the
// util.initializer that follows.
util.global private @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
// Initializer: materializes #composite_of_128b as a !stream.resource<constant> of
// size %c128 and stores the resource and its timepoint into the two globals above.
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
// First try to map the buffer constant as a resource; on success jump straight to
// ^bb2 with the immediate timepoint %0 and the mapped resource.
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
// Path taken when the map did not succeed: allocate an uninitialized resource of
// size %c128, wrap the buffer constant as a stream.file, and read %c128 from the
// file into the resource; the read's timepoint is forwarded to ^bb2.
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
// Join point: store whichever (timepoint, resource) pair was produced.
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
util.global.store %4, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %3, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
// Test body: runs the pack dispatch into a freshly allocated external resource,
// copies the first 64 bytes of the constant resource into the same allocation, and
// compares the two exported 2x2x2x2xi32 tensors with check.expect_eq.
util.func private @_fully_dynamic_pack_simple() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c128 = arith.constant 128 : index | |
%c64_i32 = arith.constant 64 : i32 | |
%c32_i64 = arith.constant 32 : i64 | |
%c0_i32 = arith.constant 0 : i32 | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
// %0 and %1 carry the value 4 and %2 the value 2, each behind an optimization
// barrier. They are passed as dispatch workload values 0, 1 and 4 below.
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c2 : index | |
// %3, %4 = ceil(%0 / %2), ceil(%1 / %2): passed as dispatch workload values 2 and 3.
%3 = arith.ceildivui %0, %2 : index | |
%4 = arith.ceildivui %1, %2 : index | |
// %8 = %3 * 4 * %4 * %2 * %2, used as the size of the result subview below.
// NOTE(review): the factor 4 is presumably the i32 element size in bytes - confirm.
%5 = arith.muli %3, %c4 : index | |
%6 = arith.muli %5, %4 : index | |
%7 = arith.muli %6, %2 : index | |
%8 = arith.muli %7, %2 : index | |
// Allocation size %10 = align(%8, 64) + 64; the extra 64 covers the region at
// offset 0 that receives the copy from the constant resource.
%9 = util.align %8, %c64 : index | |
%10 = arith.addi %9, %c64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%10} => !stream.timepoint | |
// Split each of %0, %1, %3, %4, %2 into low (trunci) and high (shrui 32, trunci)
// i32 halves so they can be passed as i32 dispatch operands.
%11 = arith.index_castui %0 : index to i64 | |
%12 = arith.trunci %11 : i64 to i32 | |
%13 = arith.shrui %11, %c32_i64 : i64 | |
%14 = arith.trunci %13 : i64 to i32 | |
%15 = arith.index_castui %1 : index to i64 | |
%16 = arith.trunci %15 : i64 to i32 | |
%17 = arith.shrui %15, %c32_i64 : i64 | |
%18 = arith.trunci %17 : i64 to i32 | |
%19 = arith.index_castui %3 : index to i64 | |
%20 = arith.trunci %19 : i64 to i32 | |
%21 = arith.shrui %19, %c32_i64 : i64 | |
%22 = arith.trunci %21 : i64 to i32 | |
%23 = arith.index_castui %4 : index to i64 | |
%24 = arith.trunci %23 : i64 to i32 | |
%25 = arith.shrui %23, %c32_i64 : i64 | |
%26 = arith.trunci %25 : i64 to i32 | |
%27 = arith.index_castui %2 : index to i64 | |
%28 = arith.trunci %27 : i64 to i32 | |
%29 = arith.shrui %27, %c32_i64 : i64 | |
%30 = arith.trunci %29 : i64 to i32 | |
// Command buffer: %arg0 is the constant resource, %arg1 the new allocation.
%31 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%10}) { | |
stream.cmd.concurrent { | |
// Dispatch operands: the first two (lo, hi) pairs are (64, 0) and (64, 0), which
// the dispatch function recombines into its two binding offsets; the remaining
// five pairs are the split workload values.
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %3, %4, %2](%c64_i32, %c0_i32, %c64_i32, %c0_i32, %12, %14, %16, %18, %20, %22, %24, %26, %28, %30 : i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { | |
ro %arg0[%c0 for %c128] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c0 for %10] : !stream.resource<external>{%10} | |
} | |
// Copy %c64 bytes from offset 0 of the constant resource to offset 0 of the
// allocation, then flush that range.
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%10} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%10} | |
} | |
} => !stream.timepoint | |
%32 = stream.timepoint.await %31 => %result : !stream.resource<external>{%10} | |
// %33: range at offset 0 (target of the copy); %34: range at offset 64 with size %8.
%33 = stream.resource.subview %32[%c0] : !stream.resource<external>{%10} -> !stream.resource<external>{%c64} | |
%34 = stream.resource.subview %32[%c64] : !stream.resource<external>{%10} -> !stream.resource<external>{%8} | |
// Export both ranges as 2x2x2x2xi32 tensors and compare them.
%35 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %33 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%36 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %34 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%36, %35) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After IPO (iree-util-ipo) //----- // | |
// Attribute aliases referenced by the module below.
// #composite_of_128b is a 128xi8 composite listing two constants, in this order:
// a 2x2x2x2xi32 tensor and a 4x4xi32 tensor (16 i32 values each).
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
// Executable target: "llvm-cpu" backend, "embedded-elf-x86_64" format, cpu = "generic".
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
// "local" device target whose only listed executable target is the alias above.
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
// Public entry point. It takes no arguments and returns nothing; the
// iree.abi.stub / iree.reflection attributes declare it as
// "sync func @fully_dynamic_pack_simple() -> ()".
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
// All work is delegated to the private function of the same name with a leading underscore.
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// Executable holding the single dispatch entry point for the pack operation.
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
// Export: the workgroup count (x, y, z) is derived from the five index
// workload values via flow.dispatch.workgroup_count_from_slice.
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body. Operands: two bindings (%arg0 read, %arg1 written) and
// fourteen i32 values. Consecutive i32 pairs are recombined into one index
// each as (low | (high << 32)).
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32, %arg11: i32, %arg12: i32, %arg13: i32, %arg14: i32, %arg15: i32) { | |
%c32_i64 = arith.constant 32 : i64 | |
// Pair (%arg2, %arg3) -> %4: offset used for the %arg0 subspan below.
%0 = arith.extui %arg2 : i32 to i64 | |
%1 = arith.extui %arg3 : i32 to i64 | |
%2 = arith.shli %1, %c32_i64 : i64 | |
%3 = arith.ori %0, %2 : i64 | |
%4 = arith.index_castui %3 {stream.alignment = 64 : index, stream.values = [64 : index]} : i64 to index | |
// Pair (%arg4, %arg5) -> %9: offset used for the %arg1 subspan below.
%5 = arith.extui %arg4 : i32 to i64 | |
%6 = arith.extui %arg5 : i32 to i64 | |
%7 = arith.shli %6, %c32_i64 : i64 | |
%8 = arith.ori %5, %7 : i64 | |
%9 = arith.index_castui %8 {stream.alignment = 64 : index, stream.values = [64 : index]} : i64 to index | |
// The remaining five pairs -> %14, %19, %24, %29, %34, which become
// workload ordinals 0..4 further down.
%10 = arith.extui %arg6 : i32 to i64 | |
%11 = arith.extui %arg7 : i32 to i64 | |
%12 = arith.shli %11, %c32_i64 : i64 | |
%13 = arith.ori %10, %12 : i64 | |
%14 = arith.index_castui %13 : i64 to index | |
%15 = arith.extui %arg8 : i32 to i64 | |
%16 = arith.extui %arg9 : i32 to i64 | |
%17 = arith.shli %16, %c32_i64 : i64 | |
%18 = arith.ori %15, %17 : i64 | |
%19 = arith.index_castui %18 : i64 to index | |
%20 = arith.extui %arg10 : i32 to i64 | |
%21 = arith.extui %arg11 : i32 to i64 | |
%22 = arith.shli %21, %c32_i64 : i64 | |
%23 = arith.ori %20, %22 : i64 | |
%24 = arith.index_castui %23 : i64 to index | |
%25 = arith.extui %arg12 : i32 to i64 | |
%26 = arith.extui %arg13 : i32 to i64 | |
%27 = arith.shli %26, %c32_i64 : i64 | |
%28 = arith.ori %25, %27 : i64 | |
%29 = arith.index_castui %28 : i64 to index | |
%30 = arith.extui %arg14 : i32 to i64 | |
%31 = arith.extui %arg15 : i32 to i64 | |
%32 = arith.shli %31, %c32_i64 : i64 | |
%33 = arith.ori %30, %32 : i64 | |
%34 = arith.index_castui %33 : i64 to index | |
%35 = flow.dispatch.workload.ordinal %14, 0 : index | |
%36 = flow.dispatch.workload.ordinal %19, 1 : index | |
%37 = flow.dispatch.workload.ordinal %24, 2 : index | |
%38 = flow.dispatch.workload.ordinal %29, 3 : index | |
%39 = flow.dispatch.workload.ordinal %34, 4 : index | |
// Source: read-only ?x? i32 tensor with dims {%35, %36}.
// Destination: write-only 4-D i32 tensor with dims {%37, %38, %39, %39}.
%40 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%35, %36} | |
%41 = stream.binding.subspan %arg1[%9] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%37, %38, %39, %39} | |
%42 = flow.dispatch.tensor.load %40, offsets = [0, 0], sizes = [%35, %36], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%35, %36} -> tensor<?x?xi32> | |
%43 = tensor.empty(%37, %38, %39, %39) : tensor<?x?x?x?xi32> | |
// Pack both source dims with the same dynamic inner tile size %39.
%pack = tensor.pack %42 inner_dims_pos = [0, 1] inner_tiles = [%39, %39] into %43 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %41, offsets = [0, 0, 0, 0], sizes = [%37, %38, %39, %39], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%37, %38, %39, %39} | |
return | |
} | |
} | |
} | |
// Globals holding the uploaded constant resource and the timepoint that
// signals when it is ready; both are written by the initializer below.
util.global private @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
// Initializer: makes the 128-byte #composite_of_128b buffer available as a
// !stream.resource<constant>, either by mapping it directly or by reading it
// through a stream.file into a fresh allocation.
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
// Fast path: try to map the host buffer as a resource. On success jump to
// ^bb2 with the immediate timepoint and the mapped resource.
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
// Fallback: allocate 128 bytes, wrap the buffer as a file, and read the
// whole file into the allocation; the read yields the readiness timepoint.
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
// Publish whichever (timepoint, resource) pair was produced.
util.global.store %4, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %3, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
// Private implementation of the test: allocates one result buffer, runs the
// pack dispatch and a 64-byte copy from the constant resource into it, then
// compares the two halves of the buffer with check.expect_eq.
util.func private @_fully_dynamic_pack_simple() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c128 = arith.constant 128 : index | |
%c64_i32 = arith.constant 64 : i32 | |
%c32_i64 = arith.constant 32 : i64 | |
%c0_i32 = arith.constant 0 : i32 | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
// %0/%1 (value 4) and %2 (value 2) pass through optimization barriers.
// NOTE(review): presumably the source dims and the tile size - confirm.
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c2 : index | |
%3 = arith.ceildivui %0, %2 : index | |
%4 = arith.ceildivui %1, %2 : index | |
// %8 = %3 * 4 * %4 * %2 * %2; %9 rounds it up to a multiple of 64 and %10
// adds another 64. NOTE(review): the factor 4 is presumably sizeof(i32).
%5 = arith.muli %3, %c4 : index | |
%6 = arith.muli %5, %4 : index | |
%7 = arith.muli %6, %2 : index | |
%8 = arith.muli %7, %2 : index | |
%9 = util.align %8, %c64 : index | |
%10 = arith.addi %9, %c64 : index | |
// Allocate %10 bytes once the constant resource's timepoint is reached.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%10} => !stream.timepoint | |
// Split each of %0, %1, %3, %4, %2 into low/high i32 halves
// (trunci, and shrui by 32 followed by trunci) to pass as dispatch operands.
%11 = arith.index_castui %0 : index to i64 | |
%12 = arith.trunci %11 : i64 to i32 | |
%13 = arith.shrui %11, %c32_i64 : i64 | |
%14 = arith.trunci %13 : i64 to i32 | |
%15 = arith.index_castui %1 : index to i64 | |
%16 = arith.trunci %15 : i64 to i32 | |
%17 = arith.shrui %15, %c32_i64 : i64 | |
%18 = arith.trunci %17 : i64 to i32 | |
%19 = arith.index_castui %3 : index to i64 | |
%20 = arith.trunci %19 : i64 to i32 | |
%21 = arith.shrui %19, %c32_i64 : i64 | |
%22 = arith.trunci %21 : i64 to i32 | |
%23 = arith.index_castui %4 : index to i64 | |
%24 = arith.trunci %23 : i64 to i32 | |
%25 = arith.shrui %23, %c32_i64 : i64 | |
%26 = arith.trunci %25 : i64 to i32 | |
%27 = arith.index_castui %2 : index to i64 | |
%28 = arith.trunci %27 : i64 to i32 | |
%29 = arith.shrui %27, %c32_i64 : i64 | |
%30 = arith.trunci %29 : i64 to i32 | |
// Command buffer: %arg0 is the constant resource, %arg1 the result buffer.
%31 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%10}) { | |
stream.cmd.concurrent { | |
// Pack dispatch; the leading (64, 0, 64, 0) operands are the i32 halves the
// dispatch body recombines into its two binding offsets.
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %3, %4, %2](%c64_i32, %c0_i32, %c64_i32, %c0_i32, %12, %14, %16, %18, %20, %22, %24, %26, %28, %30 : i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { | |
ro %arg0[%c0 for %c128] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c0 for %10] : !stream.resource<external>{%10} | |
} | |
// Copy the first 64 bytes of the constant resource to the start of the result.
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%10} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%10} | |
} | |
} => !stream.timepoint | |
%32 = stream.timepoint.await %31 => %result : !stream.resource<external>{%10} | |
// %33: bytes [0, 64) of the result (written by the copy);
// %34: the region starting at byte 64 with size %8.
%33 = stream.resource.subview %32[%c0] : !stream.resource<external>{%10} -> !stream.resource<external>{%c64} | |
%34 = stream.resource.subview %32[%c64] : !stream.resource<external>{%10} -> !stream.resource<external>{%8} | |
%35 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %33 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%36 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %34 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
// Compare the region at offset 64 (%36) against the copied region (%35).
check.expect_eq(%36, %35) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FoldUniformOperandsPass (iree-stream-fold-uniform-operands) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
// Public entry point, declared through its attributes as
// "sync func @fully_dynamic_pack_simple() -> ()": no arguments, no results.
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
// Forward to the private implementation function.
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// Executable holding the single dispatch entry point for the pack operation.
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
// Export: the workgroup count (x, y, z) is derived from the five index
// workload values via flow.dispatch.workgroup_count_from_slice.
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body. Operands: two bindings (%arg0 read, %arg1 written) and ten
// i32 values. Consecutive i32 pairs are recombined into one index each as
// (low | (high << 32)).
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32, %arg11: i32) { | |
// In this dump the two binding offsets are built from local constants
// (64 and 0) rather than from function arguments.
%c64_i32 = arith.constant 64 : i32 | |
%c0_i32 = arith.constant 0 : i32 | |
%c32_i64 = arith.constant 32 : i64 | |
// %4: offset used for the %arg0 subspan below.
%0 = arith.extui %c64_i32 : i32 to i64 | |
%1 = arith.extui %c0_i32 : i32 to i64 | |
%2 = arith.shli %1, %c32_i64 : i64 | |
%3 = arith.ori %0, %2 : i64 | |
%4 = arith.index_castui %3 {stream.alignment = 64 : index, stream.values = [64 : index]} : i64 to index | |
// %9: offset used for the %arg1 subspan below.
%5 = arith.extui %c64_i32 : i32 to i64 | |
%6 = arith.extui %c0_i32 : i32 to i64 | |
%7 = arith.shli %6, %c32_i64 : i64 | |
%8 = arith.ori %5, %7 : i64 | |
%9 = arith.index_castui %8 {stream.alignment = 64 : index, stream.values = [64 : index]} : i64 to index | |
// Five (low, high) argument pairs -> %14, %19, %24, %29, %34, which become
// workload ordinals 0..4 further down.
%10 = arith.extui %arg2 : i32 to i64 | |
%11 = arith.extui %arg3 : i32 to i64 | |
%12 = arith.shli %11, %c32_i64 : i64 | |
%13 = arith.ori %10, %12 : i64 | |
%14 = arith.index_castui %13 : i64 to index | |
%15 = arith.extui %arg4 : i32 to i64 | |
%16 = arith.extui %arg5 : i32 to i64 | |
%17 = arith.shli %16, %c32_i64 : i64 | |
%18 = arith.ori %15, %17 : i64 | |
%19 = arith.index_castui %18 : i64 to index | |
%20 = arith.extui %arg6 : i32 to i64 | |
%21 = arith.extui %arg7 : i32 to i64 | |
%22 = arith.shli %21, %c32_i64 : i64 | |
%23 = arith.ori %20, %22 : i64 | |
%24 = arith.index_castui %23 : i64 to index | |
%25 = arith.extui %arg8 : i32 to i64 | |
%26 = arith.extui %arg9 : i32 to i64 | |
%27 = arith.shli %26, %c32_i64 : i64 | |
%28 = arith.ori %25, %27 : i64 | |
%29 = arith.index_castui %28 : i64 to index | |
%30 = arith.extui %arg10 : i32 to i64 | |
%31 = arith.extui %arg11 : i32 to i64 | |
%32 = arith.shli %31, %c32_i64 : i64 | |
%33 = arith.ori %30, %32 : i64 | |
%34 = arith.index_castui %33 : i64 to index | |
%35 = flow.dispatch.workload.ordinal %14, 0 : index | |
%36 = flow.dispatch.workload.ordinal %19, 1 : index | |
%37 = flow.dispatch.workload.ordinal %24, 2 : index | |
%38 = flow.dispatch.workload.ordinal %29, 3 : index | |
%39 = flow.dispatch.workload.ordinal %34, 4 : index | |
// Source: read-only ?x? i32 tensor with dims {%35, %36}.
// Destination: write-only 4-D i32 tensor with dims {%37, %38, %39, %39}.
%40 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%35, %36} | |
%41 = stream.binding.subspan %arg1[%9] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%37, %38, %39, %39} | |
%42 = flow.dispatch.tensor.load %40, offsets = [0, 0], sizes = [%35, %36], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%35, %36} -> tensor<?x?xi32> | |
%43 = tensor.empty(%37, %38, %39, %39) : tensor<?x?x?x?xi32> | |
// Pack both source dims with the same dynamic inner tile size %39.
%pack = tensor.pack %42 inner_dims_pos = [0, 1] inner_tiles = [%39, %39] into %43 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %41, offsets = [0, 0, 0, 0], sizes = [%37, %38, %39, %39], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%37, %38, %39, %39} | |
return | |
} | |
} | |
} | |
// Globals holding the uploaded constant resource and the timepoint that
// signals when it is ready; both are written by the initializer below.
util.global private @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
// Initializer: makes the 128-byte #composite_of_128b buffer available as a
// !stream.resource<constant>, either by mapping it directly or by reading it
// through a stream.file into a fresh allocation.
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
// Fast path: try to map the host buffer as a resource. On success jump to
// ^bb2 with the immediate timepoint and the mapped resource.
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
// Fallback: allocate 128 bytes, wrap the buffer as a file, and read the
// whole file into the allocation; the read yields the readiness timepoint.
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
// Publish whichever (timepoint, resource) pair was produced.
util.global.store %4, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %3, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
// Private implementation of the test: allocates one result buffer, runs the
// pack dispatch and a 64-byte copy from the constant resource into it, then
// compares the two halves of the buffer with check.expect_eq.
util.func private @_fully_dynamic_pack_simple() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c128 = arith.constant 128 : index | |
// %c64_i32 and %c0_i32 have no remaining uses in this function in this dump.
%c64_i32 = arith.constant 64 : i32 | |
%c32_i64 = arith.constant 32 : i64 | |
%c0_i32 = arith.constant 0 : i32 | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
// %0/%1 (value 4) and %2 (value 2) pass through optimization barriers.
// NOTE(review): presumably the source dims and the tile size - confirm.
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c2 : index | |
%3 = arith.ceildivui %0, %2 : index | |
%4 = arith.ceildivui %1, %2 : index | |
// %8 = %3 * 4 * %4 * %2 * %2; %9 rounds it up to a multiple of 64 and %10
// adds another 64. NOTE(review): the factor 4 is presumably sizeof(i32).
%5 = arith.muli %3, %c4 : index | |
%6 = arith.muli %5, %4 : index | |
%7 = arith.muli %6, %2 : index | |
%8 = arith.muli %7, %2 : index | |
%9 = util.align %8, %c64 : index | |
%10 = arith.addi %9, %c64 : index | |
// Allocate %10 bytes once the constant resource's timepoint is reached.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%10} => !stream.timepoint | |
// Split each of %0, %1, %3, %4, %2 into low/high i32 halves
// (trunci, and shrui by 32 followed by trunci) to pass as dispatch operands.
%11 = arith.index_castui %0 : index to i64 | |
%12 = arith.trunci %11 : i64 to i32 | |
%13 = arith.shrui %11, %c32_i64 : i64 | |
%14 = arith.trunci %13 : i64 to i32 | |
%15 = arith.index_castui %1 : index to i64 | |
%16 = arith.trunci %15 : i64 to i32 | |
%17 = arith.shrui %15, %c32_i64 : i64 | |
%18 = arith.trunci %17 : i64 to i32 | |
%19 = arith.index_castui %3 : index to i64 | |
%20 = arith.trunci %19 : i64 to i32 | |
%21 = arith.shrui %19, %c32_i64 : i64 | |
%22 = arith.trunci %21 : i64 to i32 | |
%23 = arith.index_castui %4 : index to i64 | |
%24 = arith.trunci %23 : i64 to i32 | |
%25 = arith.shrui %23, %c32_i64 : i64 | |
%26 = arith.trunci %25 : i64 to i32 | |
%27 = arith.index_castui %2 : index to i64 | |
%28 = arith.trunci %27 : i64 to i32 | |
%29 = arith.shrui %27, %c32_i64 : i64 | |
%30 = arith.trunci %29 : i64 to i32 | |
// Command buffer: %arg0 is the constant resource, %arg1 the result buffer.
%31 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%10}) { | |
stream.cmd.concurrent { | |
// Pack dispatch; only the ten split workload halves are passed as operands.
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %3, %4, %2](%12, %14, %16, %18, %20, %22, %24, %26, %28, %30 : i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { | |
ro %arg0[%c0 for %c128] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c0 for %10] : !stream.resource<external>{%10} | |
} | |
// Copy the first 64 bytes of the constant resource to the start of the result.
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%10} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%10} | |
} | |
} => !stream.timepoint | |
%32 = stream.timepoint.await %31 => %result : !stream.resource<external>{%10} | |
// %33: bytes [0, 64) of the result (written by the copy);
// %34: the region starting at byte 64 with size %8.
%33 = stream.resource.subview %32[%c0] : !stream.resource<external>{%10} -> !stream.resource<external>{%c64} | |
%34 = stream.resource.subview %32[%c64] : !stream.resource<external>{%10} -> !stream.resource<external>{%8} | |
%35 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %33 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%36 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %34 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
// Compare the region at offset 64 (%36) against the copied region (%35).
check.expect_eq(%36, %35) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
// Public entry point, declared through its attributes as
// "sync func @fully_dynamic_pack_simple() -> ()": no arguments, no results.
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
// Forward to the private implementation function.
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
// Initializer: makes a 128-byte composite constant buffer available as a
// !stream.resource<constant>, either by mapping it directly or by reading it
// through a stream.file into a fresh allocation, then stores the resource and
// its readiness timepoint into two globals.
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
// The composite lists a 2x2x2x2 i32 tensor followed by a 4x4 i32 tensor.
// NOTE(review): presumably laid out back to back, 64 bytes each - confirm.
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
// Fast path: try to map the host buffer as a resource. On success jump to
// ^bb2 with the immediate timepoint and the mapped resource.
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
// Fallback: allocate 128 bytes, wrap the buffer as a file, and read the
// whole file into the allocation; the read yields the readiness timepoint.
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
// Publish whichever (timepoint, resource) pair was produced.
util.global.store %4, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %3, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
// Public entry point, declared through its attributes as
// "sync func @fully_dynamic_pack_simple() -> ()": no arguments, no results.
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
// Forward to the private implementation function.
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
// Public entry point, declared through its attributes as
// "sync func @fully_dynamic_pack_simple() -> ()": no arguments, no results.
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
// Forward to the private implementation function.
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
// Initializer: makes a 128-byte composite constant buffer available as a
// !stream.resource<constant>, either by mapping it directly or by reading it
// through a stream.file into a fresh allocation, then stores the resource and
// its readiness timepoint into two globals.
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
// The composite lists a 2x2x2x2 i32 tensor followed by a 4x4 i32 tensor.
// NOTE(review): presumably laid out back to back, 64 bytes each - confirm.
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
// Fast path: try to map the host buffer as a resource. On success jump to
// ^bb2 with the immediate timepoint and the mapped resource.
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
// Fallback: allocate 128 bytes, wrap the buffer as a file, and read the
// whole file into the allocation; the read yields the readiness timepoint.
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
// Publish whichever (timepoint, resource) pair was produced.
util.global.store %4, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %3, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
// Initializer: makes a 128-byte composite constant buffer available as a
// !stream.resource<constant>, either by mapping it directly or by reading it
// through a stream.file into a fresh allocation, then stores the resource and
// its readiness timepoint into two globals.
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
// The composite lists a 2x2x2x2 i32 tensor followed by a 4x4 i32 tensor.
// NOTE(review): presumably laid out back to back, 64 bytes each - confirm.
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
// Fast path: try to map the host buffer as a resource. On success jump to
// ^bb2 with the immediate timepoint and the mapped resource.
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
// Fallback: allocate 128 bytes, wrap the buffer as a file, and read the
// whole file into the allocation; the read yields the readiness timepoint.
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
// Publish whichever (timepoint, resource) pair was produced.
util.global.store %4, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %3, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
// Private implementation of the test: allocates one result buffer, runs the
// pack dispatch and a 64-byte copy from the constant resource into it, then
// compares the two halves of the buffer with check.expect_eq.
util.func private @_fully_dynamic_pack_simple() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c128 = arith.constant 128 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
// %0/%1 (value 4) and %2 (value 2) pass through optimization barriers.
// NOTE(review): presumably the source dims and the tile size - confirm.
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c2 : index | |
%3 = arith.ceildivui %0, %2 : index | |
%4 = arith.ceildivui %1, %2 : index | |
// %8 = %3 * 4 * %4 * %2 * %2; %9 rounds it up to a multiple of 64 and %10
// adds another 64. NOTE(review): the factor 4 is presumably sizeof(i32).
%5 = arith.muli %3, %c4 : index | |
%6 = arith.muli %5, %4 : index | |
%7 = arith.muli %6, %2 : index | |
%8 = arith.muli %7, %2 : index | |
%9 = util.align %8, %c64 : index | |
%10 = arith.addi %9, %c64 : index | |
// Allocate %10 bytes once the constant resource's timepoint is reached.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%10} => !stream.timepoint | |
// Split each of %0, %1, %3, %4, %2 into low/high i32 halves
// (trunci, and shrui by 32 followed by trunci) to pass as dispatch operands.
%11 = arith.index_castui %0 : index to i64 | |
%12 = arith.trunci %11 : i64 to i32 | |
%13 = arith.shrui %11, %c32_i64 : i64 | |
%14 = arith.trunci %13 : i64 to i32 | |
%15 = arith.index_castui %1 : index to i64 | |
%16 = arith.trunci %15 : i64 to i32 | |
%17 = arith.shrui %15, %c32_i64 : i64 | |
%18 = arith.trunci %17 : i64 to i32 | |
%19 = arith.index_castui %3 : index to i64 | |
%20 = arith.trunci %19 : i64 to i32 | |
%21 = arith.shrui %19, %c32_i64 : i64 | |
%22 = arith.trunci %21 : i64 to i32 | |
%23 = arith.index_castui %4 : index to i64 | |
%24 = arith.trunci %23 : i64 to i32 | |
%25 = arith.shrui %23, %c32_i64 : i64 | |
%26 = arith.trunci %25 : i64 to i32 | |
%27 = arith.index_castui %2 : index to i64 | |
%28 = arith.trunci %27 : i64 to i32 | |
%29 = arith.shrui %27, %c32_i64 : i64 | |
%30 = arith.trunci %29 : i64 to i32 | |
// Command buffer: %arg0 is the constant resource, %arg1 the result buffer.
%31 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%10}) { | |
stream.cmd.concurrent { | |
// Pack dispatch; only the ten split workload halves are passed as operands.
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %3, %4, %2](%12, %14, %16, %18, %20, %22, %24, %26, %28, %30 : i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { | |
ro %arg0[%c0 for %c128] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c0 for %10] : !stream.resource<external>{%10} | |
} | |
// Copy the first 64 bytes of the constant resource to the start of the result.
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%10} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%10} | |
} | |
} => !stream.timepoint | |
%32 = stream.timepoint.await %31 => %result : !stream.resource<external>{%10} | |
// %33: bytes [0, 64) of the result (written by the copy);
// %34: the region starting at byte 64 with size %8.
%33 = stream.resource.subview %32[%c0] : !stream.resource<external>{%10} -> !stream.resource<external>{%c64} | |
%34 = stream.resource.subview %32[%c64] : !stream.resource<external>{%10} -> !stream.resource<external>{%8} | |
%35 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %33 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%36 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %34 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
// Compare the region at offset 64 (%36) against the copied region (%35).
check.expect_eq(%36, %35) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
// Private implementation of the test: allocates one result buffer, runs the
// pack dispatch and a 64-byte copy from the constant resource into it, then
// compares the two halves of the buffer with check.expect_eq.
util.func private @_fully_dynamic_pack_simple() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c128 = arith.constant 128 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
// %0/%1 (value 4) and %2 (value 2) pass through optimization barriers.
// NOTE(review): presumably the source dims and the tile size - confirm.
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c2 : index | |
%3 = arith.ceildivui %0, %2 : index | |
%4 = arith.ceildivui %1, %2 : index | |
// %8 = %3 * 4 * %4 * %2 * %2; %9 rounds it up to a multiple of 64 and %10
// adds another 64. NOTE(review): the factor 4 is presumably sizeof(i32).
%5 = arith.muli %3, %c4 : index | |
%6 = arith.muli %5, %4 : index | |
%7 = arith.muli %6, %2 : index | |
%8 = arith.muli %7, %2 : index | |
%9 = util.align %8, %c64 : index | |
%10 = arith.addi %9, %c64 : index | |
// Allocate %10 bytes once the constant resource's timepoint is reached.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%10} => !stream.timepoint | |
// Split each of %0, %1, %3, %4, %2 into low/high i32 halves
// (trunci, and shrui by 32 followed by trunci) to pass as dispatch operands.
%11 = arith.index_castui %0 : index to i64 | |
%12 = arith.trunci %11 : i64 to i32 | |
%13 = arith.shrui %11, %c32_i64 : i64 | |
%14 = arith.trunci %13 : i64 to i32 | |
%15 = arith.index_castui %1 : index to i64 | |
%16 = arith.trunci %15 : i64 to i32 | |
%17 = arith.shrui %15, %c32_i64 : i64 | |
%18 = arith.trunci %17 : i64 to i32 | |
%19 = arith.index_castui %3 : index to i64 | |
%20 = arith.trunci %19 : i64 to i32 | |
%21 = arith.shrui %19, %c32_i64 : i64 | |
%22 = arith.trunci %21 : i64 to i32 | |
%23 = arith.index_castui %4 : index to i64 | |
%24 = arith.trunci %23 : i64 to i32 | |
%25 = arith.shrui %23, %c32_i64 : i64 | |
%26 = arith.trunci %25 : i64 to i32 | |
%27 = arith.index_castui %2 : index to i64 | |
%28 = arith.trunci %27 : i64 to i32 | |
%29 = arith.shrui %27, %c32_i64 : i64 | |
%30 = arith.trunci %29 : i64 to i32 | |
// Command buffer: %arg0 is the constant resource, %arg1 the result buffer.
%31 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%10}) { | |
stream.cmd.concurrent { | |
// Pack dispatch; only the ten split workload halves are passed as operands.
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %3, %4, %2](%12, %14, %16, %18, %20, %22, %24, %26, %28, %30 : i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { | |
ro %arg0[%c0 for %c128] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c0 for %10] : !stream.resource<external>{%10} | |
} | |
// Copy the first 64 bytes of the constant resource to the start of the result.
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%10} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%10} | |
} | |
} => !stream.timepoint | |
%32 = stream.timepoint.await %31 => %result : !stream.resource<external>{%10} | |
// %33: bytes [0, 64) of the result (written by the copy);
// %34: the region starting at byte 64 with size %8.
%33 = stream.resource.subview %32[%c0] : !stream.resource<external>{%10} -> !stream.resource<external>{%c64} | |
%34 = stream.resource.subview %32[%c64] : !stream.resource<external>{%10} -> !stream.resource<external>{%8} | |
%35 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %33 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%36 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %34 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
// Compare the region at offset 64 (%36) against the copied region (%35).
check.expect_eq(%36, %35) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
// Host-side test body: allocates one result buffer, runs the pack dispatch and
// a 64-byte copy out of the constant resource inside a single execute region,
// then compares two tensor<2x2x2x2xi32> views of that buffer with
// check.expect_eq.
util.func private @_fully_dynamic_pack_simple() { | |
// Constant resource and the timepoint that signals it is ready.
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c128 = arith.constant 128 : index | |
%c32_i64 = arith.constant 32 : i64 | |
// The values 4, 4 and 2 are hidden behind optimization barriers so they stay
// opaque to folding.
// NOTE(review): presumably %0/%1 are the two dynamic dims of the 4x4 input and
// %2 is the inner tile size of the pack — confirm against the input program.
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c2 : index | |
// %3/%4 = ceil(%0 / %2), ceil(%1 / %2).
%3 = arith.ceildivui %0, %2 : index | |
%4 = arith.ceildivui %1, %2 : index | |
// %8 = %3 * 4 * %4 * %2 * %2; it is used below as a resource size.
// NOTE(review): the factor 4 is presumably sizeof(i32), making %8 the packed
// result size in bytes — confirm.
%5 = arith.muli %3, %c4 : index | |
%6 = arith.muli %5, %4 : index | |
%7 = arith.muli %6, %2 : index | |
%8 = arith.muli %7, %2 : index | |
// Total allocation size: %8 aligned to 64, plus another 64.
%9 = util.align %8, %c64 : index | |
%10 = arith.addi %9, %c64 : index | |
// The allocation waits on the constant's timepoint and yields its own.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%10} => !stream.timepoint | |
// Each index value (%0, %1, %3, %4, %2) is widened to i64 and split into a
// low half (trunci) and a high half (shrui by 32, then trunci) so it can be
// passed to the dispatch as two i32 operands.
%11 = arith.index_castui %0 : index to i64 | |
%12 = arith.trunci %11 : i64 to i32 | |
%13 = arith.shrui %11, %c32_i64 : i64 | |
%14 = arith.trunci %13 : i64 to i32 | |
%15 = arith.index_castui %1 : index to i64 | |
%16 = arith.trunci %15 : i64 to i32 | |
%17 = arith.shrui %15, %c32_i64 : i64 | |
%18 = arith.trunci %17 : i64 to i32 | |
%19 = arith.index_castui %3 : index to i64 | |
%20 = arith.trunci %19 : i64 to i32 | |
%21 = arith.shrui %19, %c32_i64 : i64 | |
%22 = arith.trunci %21 : i64 to i32 | |
%23 = arith.index_castui %4 : index to i64 | |
%24 = arith.trunci %23 : i64 to i32 | |
%25 = arith.shrui %23, %c32_i64 : i64 | |
%26 = arith.trunci %25 : i64 to i32 | |
%27 = arith.index_castui %2 : index to i64 | |
%28 = arith.trunci %27 : i64 to i32 | |
%29 = arith.shrui %27, %c32_i64 : i64 | |
%30 = arith.trunci %29 : i64 to i32 | |
// Execute region: waits for the allocation, captures the 128-byte constant as
// %arg0 and the result buffer as %arg1.
%31 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%10}) { | |
stream.cmd.concurrent { | |
// Pack dispatch: workload is [%0, %1, %3, %4, %2]; the ten i32 operands are
// the low/high halves computed above. It reads all of %arg0 and may write
// anywhere in %arg1.
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %3, %4, %2](%12, %14, %16, %18, %20, %22, %24, %26, %28, %30 : i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { | |
ro %arg0[%c0 for %c128] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c0 for %10] : !stream.resource<external>{%10} | |
} | |
// Copy the first 64 bytes of the constant into the first 64 bytes of the
// result buffer, then flush that range.
// NOTE(review): presumably these bytes are the expected 2x2x2x2 tensor —
// confirm against the constant's layout.
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%10} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%10} | |
} | |
} => !stream.timepoint | |
// Block until the execute region completes.
%32 = stream.timepoint.await %31 => %result : !stream.resource<external>{%10} | |
// %33 views 64 bytes at offset 0 (the copy destination); %34 views %8 bytes
// starting at offset 64.
%33 = stream.resource.subview %32[%c0] : !stream.resource<external>{%10} -> !stream.resource<external>{%c64} | |
%34 = stream.resource.subview %32[%c64] : !stream.resource<external>{%10} -> !stream.resource<external>{%8} | |
// Both views are exported as static tensor<2x2x2x2xi32> with a declared size
// of 64 bytes.
// NOTE(review): %34 was created with dynamic size %8 but is exported with
// size %c64 — the two only agree if %8 evaluates to 64 at runtime.
%35 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %33 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%36 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %34 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
// Compare the region at offset 64 (%36) against the copied region (%35).
check.expect_eq(%36, %35) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // | |
// Constant pool: a single 128-byte composite that lists the 2x2x2x2 tensor
// first and the 4x4 tensor second (16 i32 values each).
// NOTE(review): presumably laid out at byte offsets 0 and 64 respectively —
// confirm.
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
// Compilation target: llvm-cpu backend emitting embedded-elf-x86_64, attached
// to a "local" HAL device.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
// Public ABI stub: only forwards to the private implementation.
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// Device executable holding the single pack dispatch.
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
// The workgroup count is derived from the five workload values.
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch entry point. %arg0/%arg1 are the two bindings; %arg2..%arg11 carry
// five index values, each passed as a (low, high) pair of i32 operands.
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32, %arg11: i32) { | |
%c32_i64 = arith.constant 32 : i64 | |
%c64 = arith.constant 64 : index | |
// Rebuild each index: zero-extend both halves to i64, shift the high half
// left by 32, OR them together, then cast to index.
%0 = arith.extui %arg2 : i32 to i64 | |
%1 = arith.extui %arg3 : i32 to i64 | |
%2 = arith.shli %1, %c32_i64 : i64 | |
%3 = arith.ori %0, %2 : i64 | |
%4 = arith.index_castui %3 : i64 to index | |
%5 = arith.extui %arg4 : i32 to i64 | |
%6 = arith.extui %arg5 : i32 to i64 | |
%7 = arith.shli %6, %c32_i64 : i64 | |
%8 = arith.ori %5, %7 : i64 | |
%9 = arith.index_castui %8 : i64 to index | |
%10 = arith.extui %arg6 : i32 to i64 | |
%11 = arith.extui %arg7 : i32 to i64 | |
%12 = arith.shli %11, %c32_i64 : i64 | |
%13 = arith.ori %10, %12 : i64 | |
%14 = arith.index_castui %13 : i64 to index | |
%15 = arith.extui %arg8 : i32 to i64 | |
%16 = arith.extui %arg9 : i32 to i64 | |
%17 = arith.shli %16, %c32_i64 : i64 | |
%18 = arith.ori %15, %17 : i64 | |
%19 = arith.index_castui %18 : i64 to index | |
%20 = arith.extui %arg10 : i32 to i64 | |
%21 = arith.extui %arg11 : i32 to i64 | |
%22 = arith.shli %21, %c32_i64 : i64 | |
%23 = arith.ori %20, %22 : i64 | |
%24 = arith.index_castui %23 : i64 to index | |
// Tie the rebuilt values to workload ordinals 0..4.
%25 = flow.dispatch.workload.ordinal %4, 0 : index | |
%26 = flow.dispatch.workload.ordinal %9, 1 : index | |
%27 = flow.dispatch.workload.ordinal %14, 2 : index | |
%28 = flow.dispatch.workload.ordinal %19, 3 : index | |
%29 = flow.dispatch.workload.ordinal %24, 4 : index | |
// Input: %arg0 at byte offset 64, viewed as a %25 x %26 i32 tensor.
// Output: %arg1 at byte offset 64, viewed as %27 x %28 x %29 x %29 i32.
%30 = stream.binding.subspan %arg0[%c64] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%25, %26} | |
%31 = stream.binding.subspan %arg1[%c64] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%27, %28, %29, %29} | |
// Load the whole input, pack it with both inner tile sizes equal to %29,
// and store the whole packed result.
%32 = flow.dispatch.tensor.load %30, offsets = [0, 0], sizes = [%25, %26], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%25, %26} -> tensor<?x?xi32> | |
%33 = tensor.empty(%27, %28, %29, %29) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %32 inner_dims_pos = [0, 1] inner_tiles = [%29, %29] into %33 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %31, offsets = [0, 0, 0, 0], sizes = [%27, %28, %29, %29], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%27, %28, %29, %29} | |
return | |
} | |
} | |
} | |
// Globals holding the uploaded constant resource and the timepoint at which it
// becomes available; both are written by the initializer below.
util.global private @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
// Initializer: turns the 128-byte composite constant into a stream resource
// and stores it, with its ready timepoint, into the globals above.
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
// First attempt: map the host buffer directly. On success go straight to
// ^bb2 with the immediate timepoint and the mapped resource.
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
// Fallback: allocate a 128-byte resource and fill it by reading the constant
// through a stream.file; the read's timepoint is forwarded to ^bb2.
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
// Join point: publish whichever resource/timepoint pair was produced.
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
util.global.store %4, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %3, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
// Host-side test body: allocates one result buffer, runs the pack dispatch and
// a 64-byte copy out of the constant resource inside a single execute region,
// then compares two tensor<2x2x2x2xi32> views of that buffer.
util.func private @_fully_dynamic_pack_simple() { | |
%c32_i64 = arith.constant 32 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
// The values 4, 4 and 2 are hidden behind optimization barriers.
// NOTE(review): presumably %0/%1 are the dynamic input dims and %2 the inner
// tile size — confirm against the input program.
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c2 : index | |
%3 = arith.ceildivui %0, %2 : index | |
%4 = arith.ceildivui %1, %2 : index | |
// %8 = %3 * 4 * %4 * %2 * %2, used below as a resource size.
// NOTE(review): the factor 4 is presumably sizeof(i32) — confirm.
%5 = arith.muli %3, %c4 : index | |
%6 = arith.muli %5, %4 : index | |
%7 = arith.muli %6, %2 : index | |
%8 = arith.muli %7, %2 : index | |
// Total allocation size: %8 aligned to 64, plus another 64.
%9 = util.align %8, %c64 : index | |
%10 = arith.addi %9, %c64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%10} => !stream.timepoint | |
// Split each of %0, %1, %3, %4, %2 into low/high i32 halves for the dispatch
// operand list (the dispatch function reassembles them).
%11 = arith.index_castui %0 : index to i64 | |
%12 = arith.trunci %11 : i64 to i32 | |
%13 = arith.shrui %11, %c32_i64 : i64 | |
%14 = arith.trunci %13 : i64 to i32 | |
%15 = arith.index_castui %1 : index to i64 | |
%16 = arith.trunci %15 : i64 to i32 | |
%17 = arith.shrui %15, %c32_i64 : i64 | |
%18 = arith.trunci %17 : i64 to i32 | |
%19 = arith.index_castui %3 : index to i64 | |
%20 = arith.trunci %19 : i64 to i32 | |
%21 = arith.shrui %19, %c32_i64 : i64 | |
%22 = arith.trunci %21 : i64 to i32 | |
%23 = arith.index_castui %4 : index to i64 | |
%24 = arith.trunci %23 : i64 to i32 | |
%25 = arith.shrui %23, %c32_i64 : i64 | |
%26 = arith.trunci %25 : i64 to i32 | |
%27 = arith.index_castui %2 : index to i64 | |
%28 = arith.trunci %27 : i64 to i32 | |
%29 = arith.shrui %27, %c32_i64 : i64 | |
%30 = arith.trunci %29 : i64 to i32 | |
// Execute region: waits for the allocation, captures the constant as %arg0
// and the result buffer as %arg1.
%31 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%10}) { | |
stream.cmd.concurrent { | |
// Pack dispatch; the dispatch function binds both resources at byte offset 64.
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %3, %4, %2](%12, %14, %16, %18, %20, %22, %24, %26, %28, %30 : i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { | |
ro %arg0[%c0 for %c128] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c0 for %10] : !stream.resource<external>{%10} | |
} | |
// Copy bytes [0, 64) of the constant into bytes [0, 64) of the result
// buffer, then flush that range.
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%10} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%10} | |
} | |
} => !stream.timepoint | |
%32 = stream.timepoint.await %31 => %result : !stream.resource<external>{%10} | |
// %33 views 64 bytes at offset 0 (the copy destination); %34 views %8 bytes
// at offset 64 (where the dispatch binds its output).
%33 = stream.resource.subview %32[%c0] : !stream.resource<external>{%10} -> !stream.resource<external>{%c64} | |
%34 = stream.resource.subview %32[%c64] : !stream.resource<external>{%10} -> !stream.resource<external>{%8} | |
// NOTE(review): %34 has dynamic size %8 but is exported with size %c64 — the
// two only agree if %8 evaluates to 64 at runtime.
%35 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %33 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%36 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %34 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
// Compare the region at offset 64 (%36) against the copied region (%35).
check.expect_eq(%36, %35) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
// Constant pool: a single 128-byte composite that lists the 2x2x2x2 tensor
// first and the 4x4 tensor second (16 i32 values each).
// NOTE(review): presumably laid out at byte offsets 0 and 64 respectively —
// confirm.
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
// Compilation target: llvm-cpu backend emitting embedded-elf-x86_64, attached
// to a "local" HAL device.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
// Public ABI stub: only forwards to the private implementation.
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// Device executable holding the single pack dispatch.
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
// The workgroup count is derived from the five workload values.
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch entry point. %arg0/%arg1 are the two bindings; %arg2..%arg11 carry
// five index values, each passed as a (low, high) pair of i32 operands.
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32, %arg11: i32) { | |
%c32_i64 = arith.constant 32 : i64 | |
%c64 = arith.constant 64 : index | |
// Rebuild each index: zero-extend both halves to i64, shift the high half
// left by 32, OR them together, then cast to index.
%0 = arith.extui %arg2 : i32 to i64 | |
%1 = arith.extui %arg3 : i32 to i64 | |
%2 = arith.shli %1, %c32_i64 : i64 | |
%3 = arith.ori %0, %2 : i64 | |
%4 = arith.index_castui %3 : i64 to index | |
%5 = arith.extui %arg4 : i32 to i64 | |
%6 = arith.extui %arg5 : i32 to i64 | |
%7 = arith.shli %6, %c32_i64 : i64 | |
%8 = arith.ori %5, %7 : i64 | |
%9 = arith.index_castui %8 : i64 to index | |
%10 = arith.extui %arg6 : i32 to i64 | |
%11 = arith.extui %arg7 : i32 to i64 | |
%12 = arith.shli %11, %c32_i64 : i64 | |
%13 = arith.ori %10, %12 : i64 | |
%14 = arith.index_castui %13 : i64 to index | |
%15 = arith.extui %arg8 : i32 to i64 | |
%16 = arith.extui %arg9 : i32 to i64 | |
%17 = arith.shli %16, %c32_i64 : i64 | |
%18 = arith.ori %15, %17 : i64 | |
%19 = arith.index_castui %18 : i64 to index | |
%20 = arith.extui %arg10 : i32 to i64 | |
%21 = arith.extui %arg11 : i32 to i64 | |
%22 = arith.shli %21, %c32_i64 : i64 | |
%23 = arith.ori %20, %22 : i64 | |
%24 = arith.index_castui %23 : i64 to index | |
// Tie the rebuilt values to workload ordinals 0..4.
%25 = flow.dispatch.workload.ordinal %4, 0 : index | |
%26 = flow.dispatch.workload.ordinal %9, 1 : index | |
%27 = flow.dispatch.workload.ordinal %14, 2 : index | |
%28 = flow.dispatch.workload.ordinal %19, 3 : index | |
%29 = flow.dispatch.workload.ordinal %24, 4 : index | |
// Input: %arg0 at byte offset 64, viewed as a %25 x %26 i32 tensor.
// Output: %arg1 at byte offset 64, viewed as %27 x %28 x %29 x %29 i32.
%30 = stream.binding.subspan %arg0[%c64] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%25, %26} | |
%31 = stream.binding.subspan %arg1[%c64] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%27, %28, %29, %29} | |
// Load the whole input, pack it with both inner tile sizes equal to %29,
// and store the whole packed result.
%32 = flow.dispatch.tensor.load %30, offsets = [0, 0], sizes = [%25, %26], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%25, %26} -> tensor<?x?xi32> | |
%33 = tensor.empty(%27, %28, %29, %29) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %32 inner_dims_pos = [0, 1] inner_tiles = [%29, %29] into %33 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %31, offsets = [0, 0, 0, 0], sizes = [%27, %28, %29, %29], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%27, %28, %29, %29} | |
return | |
} | |
} | |
} | |
// Globals holding the uploaded constant resource and the timepoint at which it
// becomes available; both are written by the initializer below.
util.global private @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
// Initializer: turns the 128-byte composite constant into a stream resource
// and stores it, with its ready timepoint, into the globals above.
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
// First attempt: map the host buffer directly. On success go straight to
// ^bb2 with the immediate timepoint and the mapped resource.
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
// Fallback: allocate a 128-byte resource and fill it by reading the constant
// through a stream.file; the read's timepoint is forwarded to ^bb2.
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
// Join point: publish whichever resource/timepoint pair was produced.
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
util.global.store %4, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %3, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
// Host-side test body: allocates one result buffer, runs the pack dispatch and
// a 64-byte copy out of the constant resource inside a single execute region,
// then compares two tensor<2x2x2x2xi32> views of that buffer.
util.func private @_fully_dynamic_pack_simple() { | |
%c32_i64 = arith.constant 32 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
// The values 4, 4 and 2 are hidden behind optimization barriers.
// NOTE(review): presumably %0/%1 are the dynamic input dims and %2 the inner
// tile size — confirm against the input program.
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c2 : index | |
%3 = arith.ceildivui %0, %2 : index | |
%4 = arith.ceildivui %1, %2 : index | |
// %8 = %3 * 4 * %4 * %2 * %2, used below as a resource size.
// NOTE(review): the factor 4 is presumably sizeof(i32) — confirm.
%5 = arith.muli %3, %c4 : index | |
%6 = arith.muli %5, %4 : index | |
%7 = arith.muli %6, %2 : index | |
%8 = arith.muli %7, %2 : index | |
// Total allocation size: %8 aligned to 64, plus another 64.
%9 = util.align %8, %c64 : index | |
%10 = arith.addi %9, %c64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%10} => !stream.timepoint | |
// Split each of %0, %1, %3, %4, %2 into low/high i32 halves for the dispatch
// operand list (the dispatch function reassembles them).
%11 = arith.index_castui %0 : index to i64 | |
%12 = arith.trunci %11 : i64 to i32 | |
%13 = arith.shrui %11, %c32_i64 : i64 | |
%14 = arith.trunci %13 : i64 to i32 | |
%15 = arith.index_castui %1 : index to i64 | |
%16 = arith.trunci %15 : i64 to i32 | |
%17 = arith.shrui %15, %c32_i64 : i64 | |
%18 = arith.trunci %17 : i64 to i32 | |
%19 = arith.index_castui %3 : index to i64 | |
%20 = arith.trunci %19 : i64 to i32 | |
%21 = arith.shrui %19, %c32_i64 : i64 | |
%22 = arith.trunci %21 : i64 to i32 | |
%23 = arith.index_castui %4 : index to i64 | |
%24 = arith.trunci %23 : i64 to i32 | |
%25 = arith.shrui %23, %c32_i64 : i64 | |
%26 = arith.trunci %25 : i64 to i32 | |
%27 = arith.index_castui %2 : index to i64 | |
%28 = arith.trunci %27 : i64 to i32 | |
%29 = arith.shrui %27, %c32_i64 : i64 | |
%30 = arith.trunci %29 : i64 to i32 | |
// Execute region: waits for the allocation, captures the constant as %arg0
// and the result buffer as %arg1.
%31 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%10}) { | |
stream.cmd.concurrent { | |
// Pack dispatch; the dispatch function binds both resources at byte offset 64.
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %3, %4, %2](%12, %14, %16, %18, %20, %22, %24, %26, %28, %30 : i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { | |
ro %arg0[%c0 for %c128] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c0 for %10] : !stream.resource<external>{%10} | |
} | |
// Copy bytes [0, 64) of the constant into bytes [0, 64) of the result
// buffer, then flush that range.
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%10} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%10} | |
} | |
} => !stream.timepoint | |
%32 = stream.timepoint.await %31 => %result : !stream.resource<external>{%10} | |
// %33 views 64 bytes at offset 0 (the copy destination); %34 views %8 bytes
// at offset 64 (where the dispatch binds its output).
%33 = stream.resource.subview %32[%c0] : !stream.resource<external>{%10} -> !stream.resource<external>{%c64} | |
%34 = stream.resource.subview %32[%c64] : !stream.resource<external>{%10} -> !stream.resource<external>{%8} | |
// NOTE(review): %34 has dynamic size %8 but is exported with size %c64 — the
// two only agree if %8 evaluates to 64 at runtime.
%35 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %33 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%36 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %34 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
// Compare the region at offset 64 (%36) against the copied region (%35).
check.expect_eq(%36, %35) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // | |
// Constant pool: a single 128-byte composite that lists the 2x2x2x2 tensor
// first and the 4x4 tensor second (16 i32 values each).
// NOTE(review): presumably laid out at byte offsets 0 and 64 respectively —
// confirm.
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
// Compilation target: llvm-cpu backend emitting embedded-elf-x86_64, attached
// to a "local" HAL device.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
// Public ABI stub: only forwards to the private implementation.
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// Device executable holding the single pack dispatch.
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
// The workgroup count is derived from the five workload values.
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch entry point. %arg0/%arg1 are the two bindings; %arg2..%arg11 carry
// five index values, each passed as a (low, high) pair of i32 operands.
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32, %arg11: i32) { | |
%c32_i64 = arith.constant 32 : i64 | |
%c64 = arith.constant 64 : index | |
// Rebuild each index: zero-extend both halves to i64, shift the high half
// left by 32, OR them together, then cast to index.
%0 = arith.extui %arg2 : i32 to i64 | |
%1 = arith.extui %arg3 : i32 to i64 | |
%2 = arith.shli %1, %c32_i64 : i64 | |
%3 = arith.ori %0, %2 : i64 | |
%4 = arith.index_castui %3 : i64 to index | |
%5 = arith.extui %arg4 : i32 to i64 | |
%6 = arith.extui %arg5 : i32 to i64 | |
%7 = arith.shli %6, %c32_i64 : i64 | |
%8 = arith.ori %5, %7 : i64 | |
%9 = arith.index_castui %8 : i64 to index | |
%10 = arith.extui %arg6 : i32 to i64 | |
%11 = arith.extui %arg7 : i32 to i64 | |
%12 = arith.shli %11, %c32_i64 : i64 | |
%13 = arith.ori %10, %12 : i64 | |
%14 = arith.index_castui %13 : i64 to index | |
%15 = arith.extui %arg8 : i32 to i64 | |
%16 = arith.extui %arg9 : i32 to i64 | |
%17 = arith.shli %16, %c32_i64 : i64 | |
%18 = arith.ori %15, %17 : i64 | |
%19 = arith.index_castui %18 : i64 to index | |
%20 = arith.extui %arg10 : i32 to i64 | |
%21 = arith.extui %arg11 : i32 to i64 | |
%22 = arith.shli %21, %c32_i64 : i64 | |
%23 = arith.ori %20, %22 : i64 | |
%24 = arith.index_castui %23 : i64 to index | |
// Tie the rebuilt values to workload ordinals 0..4.
%25 = flow.dispatch.workload.ordinal %4, 0 : index | |
%26 = flow.dispatch.workload.ordinal %9, 1 : index | |
%27 = flow.dispatch.workload.ordinal %14, 2 : index | |
%28 = flow.dispatch.workload.ordinal %19, 3 : index | |
%29 = flow.dispatch.workload.ordinal %24, 4 : index | |
// Input: %arg0 at byte offset 64, viewed as a %25 x %26 i32 tensor.
// Output: %arg1 at byte offset 64, viewed as %27 x %28 x %29 x %29 i32.
%30 = stream.binding.subspan %arg0[%c64] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%25, %26} | |
%31 = stream.binding.subspan %arg1[%c64] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%27, %28, %29, %29} | |
// Load the whole input, pack it with both inner tile sizes equal to %29,
// and store the whole packed result.
%32 = flow.dispatch.tensor.load %30, offsets = [0, 0], sizes = [%25, %26], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%25, %26} -> tensor<?x?xi32> | |
%33 = tensor.empty(%27, %28, %29, %29) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %32 inner_dims_pos = [0, 1] inner_tiles = [%29, %29] into %33 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %31, offsets = [0, 0, 0, 0], sizes = [%27, %28, %29, %29], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%27, %28, %29, %29} | |
return | |
} | |
} | |
} | |
// Globals holding the uploaded constant resource and the timepoint at which it
// becomes available; both are written by the initializer below.
util.global private @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
// Initializer: turns the 128-byte composite constant into a stream resource
// and stores it, with its ready timepoint, into the globals above.
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
// First attempt: map the host buffer directly. On success go straight to
// ^bb2 with the immediate timepoint and the mapped resource.
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
// Fallback: allocate a 128-byte resource and fill it by reading the constant
// through a stream.file; the read's timepoint is forwarded to ^bb2.
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
// Join point: publish whichever resource/timepoint pair was produced.
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
util.global.store %4, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %3, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
// Host-side test body: allocates one result buffer, runs the pack dispatch and
// a 64-byte copy out of the constant resource inside a single execute region,
// then compares two tensor<2x2x2x2xi32> views of that buffer.
util.func private @_fully_dynamic_pack_simple() { | |
%c32_i64 = arith.constant 32 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
// The values 4, 4 and 2 are hidden behind optimization barriers.
// NOTE(review): presumably %0/%1 are the dynamic input dims and %2 the inner
// tile size — confirm against the input program.
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c2 : index | |
%3 = arith.ceildivui %0, %2 : index | |
%4 = arith.ceildivui %1, %2 : index | |
// %8 = %3 * 4 * %4 * %2 * %2, used below as a resource size.
// NOTE(review): the factor 4 is presumably sizeof(i32) — confirm.
%5 = arith.muli %3, %c4 : index | |
%6 = arith.muli %5, %4 : index | |
%7 = arith.muli %6, %2 : index | |
%8 = arith.muli %7, %2 : index | |
// Total allocation size: %8 aligned to 64, plus another 64.
%9 = util.align %8, %c64 : index | |
%10 = arith.addi %9, %c64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%10} => !stream.timepoint | |
// Split each of %0, %1, %3, %4, %2 into low/high i32 halves for the dispatch
// operand list (the dispatch function reassembles them).
%11 = arith.index_castui %0 : index to i64 | |
%12 = arith.trunci %11 : i64 to i32 | |
%13 = arith.shrui %11, %c32_i64 : i64 | |
%14 = arith.trunci %13 : i64 to i32 | |
%15 = arith.index_castui %1 : index to i64 | |
%16 = arith.trunci %15 : i64 to i32 | |
%17 = arith.shrui %15, %c32_i64 : i64 | |
%18 = arith.trunci %17 : i64 to i32 | |
%19 = arith.index_castui %3 : index to i64 | |
%20 = arith.trunci %19 : i64 to i32 | |
%21 = arith.shrui %19, %c32_i64 : i64 | |
%22 = arith.trunci %21 : i64 to i32 | |
%23 = arith.index_castui %4 : index to i64 | |
%24 = arith.trunci %23 : i64 to i32 | |
%25 = arith.shrui %23, %c32_i64 : i64 | |
%26 = arith.trunci %25 : i64 to i32 | |
%27 = arith.index_castui %2 : index to i64 | |
%28 = arith.trunci %27 : i64 to i32 | |
%29 = arith.shrui %27, %c32_i64 : i64 | |
%30 = arith.trunci %29 : i64 to i32 | |
// Execute region: waits for the allocation, captures the constant as %arg0
// and the result buffer as %arg1.
%31 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%10}) { | |
stream.cmd.concurrent { | |
// Pack dispatch; the dispatch function binds both resources at byte offset 64.
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %3, %4, %2](%12, %14, %16, %18, %20, %22, %24, %26, %28, %30 : i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { | |
ro %arg0[%c0 for %c128] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c0 for %10] : !stream.resource<external>{%10} | |
} | |
// Copy bytes [0, 64) of the constant into bytes [0, 64) of the result
// buffer, then flush that range.
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%10} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%10} | |
} | |
} => !stream.timepoint | |
%32 = stream.timepoint.await %31 => %result : !stream.resource<external>{%10} | |
// %33 views 64 bytes at offset 0 (the copy destination); %34 views %8 bytes
// at offset 64 (where the dispatch binds its output).
%33 = stream.resource.subview %32[%c0] : !stream.resource<external>{%10} -> !stream.resource<external>{%c64} | |
%34 = stream.resource.subview %32[%c64] : !stream.resource<external>{%10} -> !stream.resource<external>{%8} | |
// NOTE(review): %34 has dynamic size %8 but is exported with size %c64 — the
// two only agree if %8 evaluates to 64 at runtime.
%35 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %33 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%36 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %34 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
// Compare the region at offset 64 (%36) against the copied region (%35).
check.expect_eq(%36, %35) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After IPO (iree-util-ipo) //----- // | |
// Constant pool: a single 128-byte composite that lists the 2x2x2x2 tensor
// first and the 4x4 tensor second (16 i32 values each).
// NOTE(review): presumably laid out at byte offsets 0 and 64 respectively —
// confirm.
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
// Compilation target: llvm-cpu backend emitting embedded-elf-x86_64, attached
// to a "local" HAL device.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32, %arg11: i32) { | |
%c32_i64 = arith.constant 32 : i64 | |
%c64 = arith.constant 64 : index | |
%0 = arith.extui %arg2 : i32 to i64 | |
%1 = arith.extui %arg3 : i32 to i64 | |
%2 = arith.shli %1, %c32_i64 : i64 | |
%3 = arith.ori %0, %2 : i64 | |
%4 = arith.index_castui %3 : i64 to index | |
%5 = arith.extui %arg4 : i32 to i64 | |
%6 = arith.extui %arg5 : i32 to i64 | |
%7 = arith.shli %6, %c32_i64 : i64 | |
%8 = arith.ori %5, %7 : i64 | |
%9 = arith.index_castui %8 : i64 to index | |
%10 = arith.extui %arg6 : i32 to i64 | |
%11 = arith.extui %arg7 : i32 to i64 | |
%12 = arith.shli %11, %c32_i64 : i64 | |
%13 = arith.ori %10, %12 : i64 | |
%14 = arith.index_castui %13 : i64 to index | |
%15 = arith.extui %arg8 : i32 to i64 | |
%16 = arith.extui %arg9 : i32 to i64 | |
%17 = arith.shli %16, %c32_i64 : i64 | |
%18 = arith.ori %15, %17 : i64 | |
%19 = arith.index_castui %18 : i64 to index | |
%20 = arith.extui %arg10 : i32 to i64 | |
%21 = arith.extui %arg11 : i32 to i64 | |
%22 = arith.shli %21, %c32_i64 : i64 | |
%23 = arith.ori %20, %22 : i64 | |
%24 = arith.index_castui %23 : i64 to index | |
%25 = flow.dispatch.workload.ordinal %4, 0 : index | |
%26 = flow.dispatch.workload.ordinal %9, 1 : index | |
%27 = flow.dispatch.workload.ordinal %14, 2 : index | |
%28 = flow.dispatch.workload.ordinal %19, 3 : index | |
%29 = flow.dispatch.workload.ordinal %24, 4 : index | |
%30 = stream.binding.subspan %arg0[%c64] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%25, %26} | |
%31 = stream.binding.subspan %arg1[%c64] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%27, %28, %29, %29} | |
%32 = flow.dispatch.tensor.load %30, offsets = [0, 0], sizes = [%25, %26], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%25, %26} -> tensor<?x?xi32> | |
%33 = tensor.empty(%27, %28, %29, %29) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %32 inner_dims_pos = [0, 1] inner_tiles = [%29, %29] into %33 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %31, offsets = [0, 0, 0, 0], sizes = [%27, %28, %29, %29], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%27, %28, %29, %29} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
util.global.store %4, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %3, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%c32_i64 = arith.constant 32 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c2 : index | |
%3 = arith.ceildivui %0, %2 : index | |
%4 = arith.ceildivui %1, %2 : index | |
%5 = arith.muli %3, %c4 : index | |
%6 = arith.muli %5, %4 : index | |
%7 = arith.muli %6, %2 : index | |
%8 = arith.muli %7, %2 : index | |
%9 = util.align %8, %c64 : index | |
%10 = arith.addi %9, %c64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%10} => !stream.timepoint | |
%11 = arith.index_castui %0 : index to i64 | |
%12 = arith.trunci %11 : i64 to i32 | |
%13 = arith.shrui %11, %c32_i64 : i64 | |
%14 = arith.trunci %13 : i64 to i32 | |
%15 = arith.index_castui %1 : index to i64 | |
%16 = arith.trunci %15 : i64 to i32 | |
%17 = arith.shrui %15, %c32_i64 : i64 | |
%18 = arith.trunci %17 : i64 to i32 | |
%19 = arith.index_castui %3 : index to i64 | |
%20 = arith.trunci %19 : i64 to i32 | |
%21 = arith.shrui %19, %c32_i64 : i64 | |
%22 = arith.trunci %21 : i64 to i32 | |
%23 = arith.index_castui %4 : index to i64 | |
%24 = arith.trunci %23 : i64 to i32 | |
%25 = arith.shrui %23, %c32_i64 : i64 | |
%26 = arith.trunci %25 : i64 to i32 | |
%27 = arith.index_castui %2 : index to i64 | |
%28 = arith.trunci %27 : i64 to i32 | |
%29 = arith.shrui %27, %c32_i64 : i64 | |
%30 = arith.trunci %29 : i64 to i32 | |
%31 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%10}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %3, %4, %2](%12, %14, %16, %18, %20, %22, %24, %26, %28, %30 : i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { | |
ro %arg0[%c0 for %c128] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c0 for %10] : !stream.resource<external>{%10} | |
} | |
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%10} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%10} | |
} | |
} => !stream.timepoint | |
%32 = stream.timepoint.await %31 => %result : !stream.resource<external>{%10} | |
%33 = stream.resource.subview %32[%c0] : !stream.resource<external>{%10} -> !stream.resource<external>{%c64} | |
%34 = stream.resource.subview %32[%c64] : !stream.resource<external>{%10} -> !stream.resource<external>{%8} | |
%35 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %33 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%36 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %34 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%36, %35) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After SymbolDCE (symbol-dce) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32, %arg11: i32) { | |
%c32_i64 = arith.constant 32 : i64 | |
%c64 = arith.constant 64 : index | |
%0 = arith.extui %arg2 : i32 to i64 | |
%1 = arith.extui %arg3 : i32 to i64 | |
%2 = arith.shli %1, %c32_i64 : i64 | |
%3 = arith.ori %0, %2 : i64 | |
%4 = arith.index_castui %3 : i64 to index | |
%5 = arith.extui %arg4 : i32 to i64 | |
%6 = arith.extui %arg5 : i32 to i64 | |
%7 = arith.shli %6, %c32_i64 : i64 | |
%8 = arith.ori %5, %7 : i64 | |
%9 = arith.index_castui %8 : i64 to index | |
%10 = arith.extui %arg6 : i32 to i64 | |
%11 = arith.extui %arg7 : i32 to i64 | |
%12 = arith.shli %11, %c32_i64 : i64 | |
%13 = arith.ori %10, %12 : i64 | |
%14 = arith.index_castui %13 : i64 to index | |
%15 = arith.extui %arg8 : i32 to i64 | |
%16 = arith.extui %arg9 : i32 to i64 | |
%17 = arith.shli %16, %c32_i64 : i64 | |
%18 = arith.ori %15, %17 : i64 | |
%19 = arith.index_castui %18 : i64 to index | |
%20 = arith.extui %arg10 : i32 to i64 | |
%21 = arith.extui %arg11 : i32 to i64 | |
%22 = arith.shli %21, %c32_i64 : i64 | |
%23 = arith.ori %20, %22 : i64 | |
%24 = arith.index_castui %23 : i64 to index | |
%25 = flow.dispatch.workload.ordinal %4, 0 : index | |
%26 = flow.dispatch.workload.ordinal %9, 1 : index | |
%27 = flow.dispatch.workload.ordinal %14, 2 : index | |
%28 = flow.dispatch.workload.ordinal %19, 3 : index | |
%29 = flow.dispatch.workload.ordinal %24, 4 : index | |
%30 = stream.binding.subspan %arg0[%c64] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%25, %26} | |
%31 = stream.binding.subspan %arg1[%c64] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%27, %28, %29, %29} | |
%32 = flow.dispatch.tensor.load %30, offsets = [0, 0], sizes = [%25, %26], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%25, %26} -> tensor<?x?xi32> | |
%33 = tensor.empty(%27, %28, %29, %29) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %32 inner_dims_pos = [0, 1] inner_tiles = [%29, %29] into %33 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %31, offsets = [0, 0, 0, 0], sizes = [%27, %28, %29, %29], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%27, %28, %29, %29} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
util.global.store %4, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %3, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%c32_i64 = arith.constant 32 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c2 : index | |
%3 = arith.ceildivui %0, %2 : index | |
%4 = arith.ceildivui %1, %2 : index | |
%5 = arith.muli %3, %c4 : index | |
%6 = arith.muli %5, %4 : index | |
%7 = arith.muli %6, %2 : index | |
%8 = arith.muli %7, %2 : index | |
%9 = util.align %8, %c64 : index | |
%10 = arith.addi %9, %c64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%10} => !stream.timepoint | |
%11 = arith.index_castui %0 : index to i64 | |
%12 = arith.trunci %11 : i64 to i32 | |
%13 = arith.shrui %11, %c32_i64 : i64 | |
%14 = arith.trunci %13 : i64 to i32 | |
%15 = arith.index_castui %1 : index to i64 | |
%16 = arith.trunci %15 : i64 to i32 | |
%17 = arith.shrui %15, %c32_i64 : i64 | |
%18 = arith.trunci %17 : i64 to i32 | |
%19 = arith.index_castui %3 : index to i64 | |
%20 = arith.trunci %19 : i64 to i32 | |
%21 = arith.shrui %19, %c32_i64 : i64 | |
%22 = arith.trunci %21 : i64 to i32 | |
%23 = arith.index_castui %4 : index to i64 | |
%24 = arith.trunci %23 : i64 to i32 | |
%25 = arith.shrui %23, %c32_i64 : i64 | |
%26 = arith.trunci %25 : i64 to i32 | |
%27 = arith.index_castui %2 : index to i64 | |
%28 = arith.trunci %27 : i64 to i32 | |
%29 = arith.shrui %27, %c32_i64 : i64 | |
%30 = arith.trunci %29 : i64 to i32 | |
%31 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%10}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %3, %4, %2](%12, %14, %16, %18, %20, %22, %24, %26, %28, %30 : i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { | |
ro %arg0[%c0 for %c128] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c0 for %10] : !stream.resource<external>{%10} | |
} | |
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%10} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%10} | |
} | |
} => !stream.timepoint | |
%32 = stream.timepoint.await %31 => %result : !stream.resource<external>{%10} | |
%33 = stream.resource.subview %32[%c0] : !stream.resource<external>{%10} -> !stream.resource<external>{%c64} | |
%34 = stream.resource.subview %32[%c64] : !stream.resource<external>{%10} -> !stream.resource<external>{%8} | |
%35 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %33 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%36 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %34 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%36, %35) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After AssignLegacyTargetDevicesPass (iree-hal-assign-legacy-target-devices) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32, %arg11: i32) { | |
%c32_i64 = arith.constant 32 : i64 | |
%c64 = arith.constant 64 : index | |
%0 = arith.extui %arg2 : i32 to i64 | |
%1 = arith.extui %arg3 : i32 to i64 | |
%2 = arith.shli %1, %c32_i64 : i64 | |
%3 = arith.ori %0, %2 : i64 | |
%4 = arith.index_castui %3 : i64 to index | |
%5 = arith.extui %arg4 : i32 to i64 | |
%6 = arith.extui %arg5 : i32 to i64 | |
%7 = arith.shli %6, %c32_i64 : i64 | |
%8 = arith.ori %5, %7 : i64 | |
%9 = arith.index_castui %8 : i64 to index | |
%10 = arith.extui %arg6 : i32 to i64 | |
%11 = arith.extui %arg7 : i32 to i64 | |
%12 = arith.shli %11, %c32_i64 : i64 | |
%13 = arith.ori %10, %12 : i64 | |
%14 = arith.index_castui %13 : i64 to index | |
%15 = arith.extui %arg8 : i32 to i64 | |
%16 = arith.extui %arg9 : i32 to i64 | |
%17 = arith.shli %16, %c32_i64 : i64 | |
%18 = arith.ori %15, %17 : i64 | |
%19 = arith.index_castui %18 : i64 to index | |
%20 = arith.extui %arg10 : i32 to i64 | |
%21 = arith.extui %arg11 : i32 to i64 | |
%22 = arith.shli %21, %c32_i64 : i64 | |
%23 = arith.ori %20, %22 : i64 | |
%24 = arith.index_castui %23 : i64 to index | |
%25 = flow.dispatch.workload.ordinal %4, 0 : index | |
%26 = flow.dispatch.workload.ordinal %9, 1 : index | |
%27 = flow.dispatch.workload.ordinal %14, 2 : index | |
%28 = flow.dispatch.workload.ordinal %19, 3 : index | |
%29 = flow.dispatch.workload.ordinal %24, 4 : index | |
%30 = stream.binding.subspan %arg0[%c64] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%25, %26} | |
%31 = stream.binding.subspan %arg1[%c64] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%27, %28, %29, %29} | |
%32 = flow.dispatch.tensor.load %30, offsets = [0, 0], sizes = [%25, %26], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%25, %26} -> tensor<?x?xi32> | |
%33 = tensor.empty(%27, %28, %29, %29) : tensor<?x?x?x?xi32> | |
%pack = tensor.pack %32 inner_dims_pos = [0, 1] inner_tiles = [%29, %29] into %33 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
flow.dispatch.tensor.store %pack, %31, offsets = [0, 0, 0, 0], sizes = [%27, %28, %29, %29], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%27, %28, %29, %29} | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
util.global.store %4, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %3, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
util.func private @_fully_dynamic_pack_simple() { | |
%c32_i64 = arith.constant 32 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c2 : index | |
%3 = arith.ceildivui %0, %2 : index | |
%4 = arith.ceildivui %1, %2 : index | |
%5 = arith.muli %3, %c4 : index | |
%6 = arith.muli %5, %4 : index | |
%7 = arith.muli %6, %2 : index | |
%8 = arith.muli %7, %2 : index | |
%9 = util.align %8, %c64 : index | |
%10 = arith.addi %9, %c64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%10} => !stream.timepoint | |
%11 = arith.index_castui %0 : index to i64 | |
%12 = arith.trunci %11 : i64 to i32 | |
%13 = arith.shrui %11, %c32_i64 : i64 | |
%14 = arith.trunci %13 : i64 to i32 | |
%15 = arith.index_castui %1 : index to i64 | |
%16 = arith.trunci %15 : i64 to i32 | |
%17 = arith.shrui %15, %c32_i64 : i64 | |
%18 = arith.trunci %17 : i64 to i32 | |
%19 = arith.index_castui %3 : index to i64 | |
%20 = arith.trunci %19 : i64 to i32 | |
%21 = arith.shrui %19, %c32_i64 : i64 | |
%22 = arith.trunci %21 : i64 to i32 | |
%23 = arith.index_castui %4 : index to i64 | |
%24 = arith.trunci %23 : i64 to i32 | |
%25 = arith.shrui %23, %c32_i64 : i64 | |
%26 = arith.trunci %25 : i64 to i32 | |
%27 = arith.index_castui %2 : index to i64 | |
%28 = arith.trunci %27 : i64 to i32 | |
%29 = arith.shrui %27, %c32_i64 : i64 | |
%30 = arith.trunci %29 : i64 to i32 | |
%31 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%10}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %3, %4, %2](%12, %14, %16, %18, %20, %22, %24, %26, %28, %30 : i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { | |
ro %arg0[%c0 for %c128] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c0 for %10] : !stream.resource<external>{%10} | |
} | |
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%10} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%10} | |
} | |
} => !stream.timepoint | |
%32 = stream.timepoint.await %31 => %result : !stream.resource<external>{%10} | |
%33 = stream.resource.subview %32[%c0] : !stream.resource<external>{%10} -> !stream.resource<external>{%c64} | |
%34 = stream.resource.subview %32[%c64] : !stream.resource<external>{%10} -> !stream.resource<external>{%8} | |
%35 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %33 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
%36 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %34 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
check.expect_eq(%36, %35) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [12, 13]], [[10, 11], [14, 15]]]]> : tensor<2x2x2x2xi32>, | |
dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>, | |
]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialized from #device_target_local; it is the symbol named
// by every #hal.device.affinity<@__device_0> attribute in this module.
util.global private @__device_0 = #device_target_local | |
// Public entry point, marked as an ABI stub. Its reflection declaration is
// "sync func @fully_dynamic_pack_simple() -> ()": no arguments, no results.
// The body only forwards to the private @_fully_dynamic_pack_simple and returns.
util.func public @fully_dynamic_pack_simple() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @fully_dynamic_pack_simple() -> ()"}} { | |
util.call @_fully_dynamic_pack_simple() : () -> () | |
util.return | |
} | |
// Executable holding the single pack dispatch used by @_fully_dynamic_pack_simple.
stream.executable private @_fully_dynamic_pack_simple_dispatch_0 { | |
// Export entry: takes the five index workload values and derives the
// (x, y, z) workgroup count from them via
// flow.dispatch.workgroup_count_from_slice.
stream.executable.export public @_fully_dynamic_pack_simple_dispatch_0_pack_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Kernel body.
//   %arg0, %arg1 : resource bindings (source and destination).
//   %arg2..%arg11: ten i32 operands forming five (low, high) pairs; each pair
//                  is recombined below into one index value.
func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32, %arg11: i32) { | |
%c32_i64 = arith.constant 32 : i64 | |
%c64 = arith.constant 64 : index | |
// Pair (%arg2, %arg3): zero-extend both to i64, shift the second left by 32,
// OR them together, and cast to index. The same pattern repeats for each pair.
%0 = arith.extui %arg2 : i32 to i64 | |
%1 = arith.extui %arg3 : i32 to i64 | |
%2 = arith.shli %1, %c32_i64 : i64 | |
%3 = arith.ori %0, %2 : i64 | |
%4 = arith.index_castui %3 : i64 to index | |
// Pair (%arg4, %arg5).
%5 = arith.extui %arg4 : i32 to i64 | |
%6 = arith.extui %arg5 : i32 to i64 | |
%7 = arith.shli %6, %c32_i64 : i64 | |
%8 = arith.ori %5, %7 : i64 | |
%9 = arith.index_castui %8 : i64 to index | |
// Pair (%arg6, %arg7).
%10 = arith.extui %arg6 : i32 to i64 | |
%11 = arith.extui %arg7 : i32 to i64 | |
%12 = arith.shli %11, %c32_i64 : i64 | |
%13 = arith.ori %10, %12 : i64 | |
%14 = arith.index_castui %13 : i64 to index | |
// Pair (%arg8, %arg9).
%15 = arith.extui %arg8 : i32 to i64 | |
%16 = arith.extui %arg9 : i32 to i64 | |
%17 = arith.shli %16, %c32_i64 : i64 | |
%18 = arith.ori %15, %17 : i64 | |
%19 = arith.index_castui %18 : i64 to index | |
// Pair (%arg10, %arg11).
%20 = arith.extui %arg10 : i32 to i64 | |
%21 = arith.extui %arg11 : i32 to i64 | |
%22 = arith.shli %21, %c32_i64 : i64 | |
%23 = arith.ori %20, %22 : i64 | |
%24 = arith.index_castui %23 : i64 to index | |
// Tag the five recombined values as workload ordinals 0..4.
%25 = flow.dispatch.workload.ordinal %4, 0 : index | |
%26 = flow.dispatch.workload.ordinal %9, 1 : index | |
%27 = flow.dispatch.workload.ordinal %14, 2 : index | |
%28 = flow.dispatch.workload.ordinal %19, 3 : index | |
%29 = flow.dispatch.workload.ordinal %24, 4 : index | |
// Source: read-only ?x? i32 tensor of shape (%25, %26), starting at byte
// offset 64 of binding %arg0.
%30 = stream.binding.subspan %arg0[%c64] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%25, %26} | |
// Destination: write-only 4-D i32 tensor of shape (%27, %28, %29, %29),
// starting at byte offset 64 of binding %arg1.
%31 = stream.binding.subspan %arg1[%c64] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%27, %28, %29, %29} | |
// Load the whole source tensor (offsets 0, full sizes, unit strides).
%32 = flow.dispatch.tensor.load %30, offsets = [0, 0], sizes = [%25, %26], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%25, %26} -> tensor<?x?xi32> | |
%33 = tensor.empty(%27, %28, %29, %29) : tensor<?x?x?x?xi32> | |
// Pack source dims 0 and 1 using the same dynamic inner tile size %29 for both.
%pack = tensor.pack %32 inner_dims_pos = [0, 1] inner_tiles = [%29, %29] into %33 : tensor<?x?xi32> -> tensor<?x?x?x?xi32> | |
// Store the whole packed result into the destination binding.
flow.dispatch.tensor.store %pack, %31, offsets = [0, 0, 0, 0], sizes = [%27, %28, %29, %29], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%27, %28, %29, %29} | |
return | |
} | |
} | |
} | |
// Globals filled in by the initializer below: the constant resource and the
// timepoint that signals when it is ready. The timepoint global starts out as
// #stream.timepoint<immediate>.
util.global private @__constant_tensor_2x2x2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
// Initializer: makes the 128-byte #composite_of_128b data available as a
// !stream.resource<constant>. It first attempts stream.resource.try_map on the
// host buffer; if that fails it allocates a resource and fills it with a file read.
util.initializer { | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
// %did_map selects the path: when set, go straight to ^bb2 with the immediate
// timepoint and the mapped resource; otherwise take ^bb1.
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
// Fallback path: allocate an uninitialized 128-byte constant resource, wrap the
// host buffer as a stream file, and read 128 bytes from file offset 0 into
// resource offset 0. The read yields timepoint %2.
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
// Join point: store whichever (timepoint, resource) pair arrived into the globals.
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
util.global.store %4, @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
util.global.store %3, @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
// Host-side body of the test. It computes the packed result size, allocates one
// external resource, runs the pack dispatch and a 64-byte copy into it, then
// exports two 2x2x2x2 i32 views of that resource and compares them with
// check.expect_eq.
util.func private @_fully_dynamic_pack_simple() { | |
%c32_i64 = arith.constant 32 : i64 | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_2x2x2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2x2x2xi32__timepoint : !stream.timepoint | |
%__constant_tensor_2x2x2x2xi32 = util.global.load immutable @__constant_tensor_2x2x2x2xi32 : !stream.resource<constant> | |
// %0, %1 (both 4) and %2 (2) pass through optimization barriers. They are the
// first, second and fifth workload values of the dispatch below.
// NOTE(review): presumably %0/%1 are the two source dims and %2 the inner
// tile size -- confirm against the earlier pass dumps.
%0 = util.optimization_barrier %c4 : index | |
%1 = util.optimization_barrier %c4 : index | |
%2 = util.optimization_barrier %c2 : index | |
// %3 = ceil(%0 / %2), %4 = ceil(%1 / %2): third and fourth workload values.
%3 = arith.ceildivui %0, %2 : index | |
%4 = arith.ceildivui %1, %2 : index | |
// %8 = %3 * 4 * %4 * %2 * %2.
// NOTE(review): the factor 4 is presumably the i32 element size in bytes,
// making %8 the packed result's byte size -- confirm.
%5 = arith.muli %3, %c4 : index | |
%6 = arith.muli %5, %4 : index | |
%7 = arith.muli %6, %2 : index | |
%8 = arith.muli %7, %2 : index | |
// Total allocation size %10 = align(%8, 64) + 64.
%9 = util.align %8, %c64 : index | |
%10 = arith.addi %9, %c64 : index | |
// Allocate the external result resource once the constant's timepoint resolves.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_2x2x2x2xi32__timepoint) => !stream.resource<external>{%10} => !stream.timepoint | |
// Split each of the five index values (%0, %1, %3, %4, %2) into two i32
// operands: cast to i64, truncate for the low half, shift right by 32 and
// truncate for the high half.
%11 = arith.index_castui %0 : index to i64 | |
%12 = arith.trunci %11 : i64 to i32 | |
%13 = arith.shrui %11, %c32_i64 : i64 | |
%14 = arith.trunci %13 : i64 to i32 | |
%15 = arith.index_castui %1 : index to i64 | |
%16 = arith.trunci %15 : i64 to i32 | |
%17 = arith.shrui %15, %c32_i64 : i64 | |
%18 = arith.trunci %17 : i64 to i32 | |
%19 = arith.index_castui %3 : index to i64 | |
%20 = arith.trunci %19 : i64 to i32 | |
%21 = arith.shrui %19, %c32_i64 : i64 | |
%22 = arith.trunci %21 : i64 to i32 | |
%23 = arith.index_castui %4 : index to i64 | |
%24 = arith.trunci %23 : i64 to i32 | |
%25 = arith.shrui %23, %c32_i64 : i64 | |
%26 = arith.trunci %25 : i64 to i32 | |
%27 = arith.index_castui %2 : index to i64 | |
%28 = arith.trunci %27 : i64 to i32 | |
%29 = arith.shrui %27, %c32_i64 : i64 | |
%30 = arith.trunci %29 : i64 to i32 | |
// Command region: waits on the alloca timepoint and captures the constant
// resource as %arg0 and the result resource as %arg1.
%31 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_2x2x2x2xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%10}) { | |
stream.cmd.concurrent { | |
// Pack dispatch: workload [%0, %1, %3, %4, %2] plus the ten i32 halves.
// It is given read access to all 128 bytes of %arg0 and write access to all
// %10 bytes of %arg1.
stream.cmd.dispatch @_fully_dynamic_pack_simple_dispatch_0::@_fully_dynamic_pack_simple_dispatch_0_pack_i32[%0, %1, %3, %4, %2](%12, %14, %16, %18, %20, %22, %24, %26, %28, %30 : i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { | |
ro %arg0[%c0 for %c128] : !stream.resource<constant>{%c128}, | |
wo %arg1[%c0 for %10] : !stream.resource<external>{%10} | |
} | |
// Copy 64 bytes from offset 0 of the constant into offset 0 of the result,
// then flush that 64-byte range.
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%10} | |
stream.cmd.flush %arg1[%c0 for %c64] : !stream.resource<external>{%10} | |
} | |
} => !stream.timepoint | |
// Wait for the command region before reading the result resource.
%32 = stream.timepoint.await %31 => %result : !stream.resource<external>{%10} | |
// %33: bytes [0, 64) of the result (the range written by the copy above).
// %34: %8 bytes starting at offset 64 (the kernel's destination subspan also
// starts at byte offset 64 of its second binding).
%33 = stream.resource.subview %32[%c0] : !stream.resource<external>{%10} -> !stream.resource<external>{%c64} | |
%34 = stream.resource.subview %32[%c64] : !stream.resource<external>{%10} -> !stream.resource<external>{%8} | |
%35 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %33 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
// NOTE(review): %34 is produced with size %8 but exported here with size %c64.
// The two agree for the barrier values above (2 * 4 * 2 * 2 * 2 = 64) --
// confirm this is intended.
%36 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %34 : tensor<2x2x2x2xi32> in !stream.resource<external>{%c64} -> tensor<2x2x2x2xi32> | |
// Compare the view at offset 64 (%36) against the view at offset 0 (%35).
check.expect_eq(%36, %35) : tensor<2x2x2x2xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[[0, 1], [4, 5]], [[2, 3], [6, 7]]], [[[8, 9], [1 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment