// NOTE: This file has been truncated.
// -----// IR Dump After AutoInputConversionPipelinePass (iree-auto-input-conversion) //----- //
module {
func.func @scatter() -> tensor<2x2xi32> {
%0 = util.unfoldable_constant dense<0> : tensor<2x2xi32>
%1 = util.unfoldable_constant dense<1> : tensor<2xi32>
%2 = util.unfoldable_constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
return %3 : tensor<2x2xi32>
}
}
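For reference, the following is a minimal NumPy-style sketch of what the scatter above computes, assuming overwrite semantics (the region simply yields the update %arg0); the variable names are illustrative and not part of the IR. Most of the passes that follow leave this small program unchanged, so consecutive dumps below are largely identical.

import numpy as np

# Mirrors the constants in the IR dump above (illustrative only).
init    = np.zeros((2, 2), dtype=np.int32)             # %0: dense<0> : tensor<2x2xi32>
updates = np.ones((2,), dtype=np.int32)                # %1: dense<1> : tensor<2xi32>
indices = np.array([[0, 0], [1, 1]], dtype=np.int32)   # %2: dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>

# With dimension_map = [0, 1], each row of `indices` is a full (row, col)
# coordinate into the result; since the region yields %arg0, the update
# simply overwrites the destination element.
result = init.copy()
for k in range(indices.shape[0]):
    i, j = indices[k]
    result[i, j] = updates[k]

print(result)  # expected: [[1 0]
               #            [0 1]]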
// -----// IR Dump After IREEImportPublicPass (iree-import-public) //----- //
module {
util.func public @scatter() -> tensor<2x2xi32> {
%0 = util.unfoldable_constant dense<0> : tensor<2x2xi32>
%1 = util.unfoldable_constant dense<1> : tensor<2xi32>
%2 = util.unfoldable_constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
util.return %3 : tensor<2x2xi32>
}
}
// -----// IR Dump After ImportMLProgramPass (iree-import-ml-program) //----- //
module {
util.func public @scatter() -> tensor<2x2xi32> {
%0 = util.unfoldable_constant dense<0> : tensor<2x2xi32>
%1 = util.unfoldable_constant dense<1> : tensor<2xi32>
%2 = util.unfoldable_constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
util.return %3 : tensor<2x2xi32>
}
}
// -----// IR Dump After SanitizeModuleNamesPass (iree-sanitize-module-names) //----- //
module {
util.func public @scatter() -> tensor<2x2xi32> {
%0 = util.unfoldable_constant dense<0> : tensor<2x2xi32>
%1 = util.unfoldable_constant dense<1> : tensor<2xi32>
%2 = util.unfoldable_constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
util.return %3 : tensor<2x2xi32>
}
}
// -----// IR Dump After ConvertMeshToFlowPass (iree-convert-mesh-to-flow) //----- //
module {
util.func public @scatter() -> tensor<2x2xi32> {
%0 = util.unfoldable_constant dense<0> : tensor<2x2xi32>
%1 = util.unfoldable_constant dense<1> : tensor<2xi32>
%2 = util.unfoldable_constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
util.return %3 : tensor<2x2xi32>
}
}
// -----// IR Dump After DemoteF64ToF32Pass (iree-input-conversion-demote-f64-to-f32) //----- //
module {
util.func public @scatter() -> tensor<2x2xi32> {
%0 = util.unfoldable_constant dense<0> : tensor<2x2xi32>
%1 = util.unfoldable_constant dense<1> : tensor<2xi32>
%2 = util.unfoldable_constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
util.return %3 : tensor<2x2xi32>
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::ConvertStreamableOpsPass (iree-abi-convert-streamable-ops) //----- //
module {
util.func public @scatter() -> tensor<2x2xi32> {
%0 = util.unfoldable_constant dense<0> : tensor<2x2xi32>
%1 = util.unfoldable_constant dense<1> : tensor<2xi32>
%2 = util.unfoldable_constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
util.return %3 : tensor<2x2xi32>
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass (iree-abi-wrap-entry-points) //----- //
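// NOTE: The ABI wrapping below splits the entry point: the public @scatter stub exports the result tensor to a !hal.buffer_view via hal.tensor.export and calls the private @_scatter, which keeps the original body.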
module {
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%0 = util.call @_scatter() : () -> tensor<2x2xi32>
%1 = hal.tensor.export %0 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %1 : !hal.buffer_view
}
util.func private @_scatter() -> tensor<2x2xi32> {
%0 = util.unfoldable_constant dense<0> : tensor<2x2xi32>
%1 = util.unfoldable_constant dense<1> : tensor<2xi32>
%2 = util.unfoldable_constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
util.return %3 : tensor<2x2xi32>
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
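// NOTE: Canonicalization rewrites each util.unfoldable_constant into an arith.constant feeding a util.optimization_barrier, so the constant values still cannot be folded into the scatter.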
util.func private @_scatter() -> tensor<2x2xi32> {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
util.return %3 : tensor<2x2xi32>
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%0 = util.call @_scatter() : () -> tensor<2x2xi32>
%1 = hal.tensor.export %0 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After Inliner (inline) //----- //
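// NOTE: After inlining, the private @_scatter body lives directly in the public @scatter entry point; the module below contains only the public function.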
module {
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After SymbolDCE (symbol-dce) //----- //
module {
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After AssignLegacyTargetDevicesPass (iree-hal-assign-legacy-target-devices) //----- //
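// NOTE: From here on, the module carries the compilation target: an llvm-cpu executable target (embedded-elf-x86_64, generic CPU) wrapped in a local #hal.device.target and recorded in the hal.device.targets module attribute.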
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {hal.device.targets = [#device_target_local]} {
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- //
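// NOTE: The device list is materialized into a util.global (@__device_0), and the module gains a stream.affinity.default attribute referring to it.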
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After OptimizeIntArithmetic (iree-util-optimize-int-arithmetic) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After LinalgQuantizedConvToConvPass (iree-global-opt-quantized-conv-to-conv) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After LinalgQuantizedMatmulToMatmulPass (iree-global-opt-quantized-matmul-to-matmul) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After RemoveZeroExtentTensorsPass (iree-global-opt-remove-zero-extent-tensors) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After DetachElementwiseFromNamedOpsPass (iree-global-opt-detach-elementwise-from-named-ops) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After LinalgNamedOpConversionPass (linalg-named-op-conversion) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After Convert1X1FilterConv2DToMatmulPass (iree-global-opt-convert-1x1-filter-conv2d-to-matmul) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After EraseUnusedLinalgOperandsPass (iree-global-opt-erase-unused-linalg-operands) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After ExpandTensorShapesPass (iree-global-opt-expand-tensor-shapes) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After ConvertElementwiseToLinalgPass (convert-elementwise-to-linalg) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After DecomposeConcatPass (iree-global-opt-decompose-concat) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After FoldUnitExtentDimsPass (iree-dispatch-creation-fold-unit-extent-dims) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After DemoteContractionInputsToBF16Pass (iree-global-opt-demote-contraction-inputs-to-bf16) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After SetEncodingPass (iree-dispatch-creation-set-encoding) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After CPUMaterializeHostEncodingPass (iree-codegen-cpu-materialize-host-encoding) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After MaterializeHomogeneousEncodingsPass (iree-global-opt-materialize-homogeneous-encodings) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After CSE (cse) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After SimplifyPackUnpackPass (iree-global-opt-simplify-pack-unpack) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After DataLayoutPropagationPass (iree-global-opt-data-layout-propagation) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After GlobalLoopInvariantCodeMotionPass (iree-global-opt-loop-invariant-code-motion) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After OptimizeIntArithmetic (iree-util-optimize-int-arithmetic) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After HoistIntoGlobals (iree-util-hoist-into-globals) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After JitGlobalsPass (iree-consteval-jit-globals) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After TensorPadToTensorInsertSlicePass (iree-dispatch-creation-tensor-pad-to-tensor-insert-slice) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After FixedPointIterator (iree-util-fixed-point-iterator) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After FusionPreprocessingPass (iree-dispatch-creation-fusion-preprocessing) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After BubbleUpExpandShapesPass (iree-dispatch-creation-bubble-up-expand-shapes) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After BubbleUpExtractSlicesPass (iree-dispatch-creation-bubble-up-extract-slices) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After SinkReshapesPass (iree-dispatch-creation-sink-reshapes) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After FuseMultiUseElementwiseProducerPass (iree-dispatch-creation-fuse-multi-use-elementwise-producer) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After SplitReductionPass (iree-dispatch-creation-split-reduction-ops) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After TransposeGenericOpsPass (iree-dispatch-creation-transpose-generic-ops) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After FormScalarDispatchesPass (iree-dispatch-creation-form-scalar-dispatches) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-dispatch-creation-form-dispatch-regions) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = flow.dispatch.region -> (tensor<2x2xi32>) {
%5 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
flow.return %5 : tensor<2x2xi32>
}
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After CloneProducersIntoDispatchRegionsPass (iree-dispatch-creation-clone-producers-into-dispatch-regions) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = flow.dispatch.region -> (tensor<2x2xi32>) {
%5 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
flow.return %5 : tensor<2x2xi32>
}
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After CollapseDimensionsPass (iree-dispatch-creation-collapse-dimensions) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = flow.dispatch.region -> (tensor<2x2xi32>) {
%5 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%1, %2 : tensor<2xi32>, tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
flow.return %5 : tensor<2x2xi32>
}
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After ConvertDispatchRegionsToWorkgroupsPass (iree-dispatch-creation-convert-dispatch-regions-to-workgroups) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = flow.dispatch.workgroups(%1, %2, %0) : (tensor<2xi32>, tensor<2x2xi32>, tensor<2x2xi32>) -> %0 =
(%arg0: !flow.dispatch.tensor<readonly:tensor<2xi32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<2x2xi32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>) {
%5 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%8 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%5, %6 : tensor<2xi32>, tensor<2x2xi32>) outs(%7 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %8, %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
flow.return
}
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
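As a rough illustration of the workgroups form above (assumed names, not IREE runtime code), the dispatch body loads the two read-only bindings and the read-write binding, applies the same overwrite-scatter, and stores the result back into the read-write binding that is tied to %0 via `-> %0`:

import numpy as np

# Hypothetical model of the flow.dispatch.workgroups body: three bindings,
# the last one read-write and tied to the dispatch result.
def dispatch_body(bindings):
    updates = bindings["arg0"]             # readonly  tensor<2xi32>
    indices = bindings["arg1"]             # readonly  tensor<2x2xi32>
    out = bindings["arg2"].copy()          # readwrite tensor<2x2xi32> (loaded)
    for row in range(indices.shape[0]):    # scatter with overwrite semantics
        d0, d1 = indices[row]
        out[d0, d1] = updates[row]
    bindings["arg2"] = out                 # flow.dispatch.tensor.store back

bindings = {
    "arg0": np.ones(2, dtype=np.int32),
    "arg1": np.array([[0, 0], [1, 1]], np.int32),
    "arg2": np.zeros((2, 2), dtype=np.int32),
}
dispatch_body(bindings)
print(bindings["arg2"])  # [[1 0]
                         #  [0 1]]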
// -----// IR Dump After ConvertTensorToFlowPass (iree-dispatch-creation-convert-tensor-to-flow) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = flow.dispatch.workgroups(%1, %2, %0) : (tensor<2xi32>, tensor<2x2xi32>, tensor<2x2xi32>) -> %0 =
(%arg0: !flow.dispatch.tensor<readonly:tensor<2xi32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<2x2xi32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>) {
%5 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%8 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%5, %6 : tensor<2xi32>, tensor<2x2xi32>) outs(%7 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %8, %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
flow.return
}
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = flow.dispatch.workgroups(%1, %2, %0) : (tensor<2xi32>, tensor<2x2xi32>, tensor<2x2xi32>) -> %0 =
(%arg0: !flow.dispatch.tensor<readonly:tensor<2xi32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<2x2xi32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>) {
%5 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%8 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%5, %6 : tensor<2xi32>, tensor<2x2xi32>) outs(%7 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %8, %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
flow.return
}
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After MaterializeDefaultWorkgroupCountRegionPass (iree-dispatch-creation-materialize-default-workgroup-count-region) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = flow.dispatch.workgroups(%1, %2, %0) : (tensor<2xi32>, tensor<2x2xi32>, tensor<2x2xi32>) -> %0 =
(%arg0: !flow.dispatch.tensor<readonly:tensor<2xi32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<2x2xi32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>) {
%5 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%8 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%5, %6 : tensor<2xi32>, tensor<2x2xi32>) outs(%7 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %8, %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
flow.return
} count() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After VerifyInputLegalityPass (iree-verify-input-legality) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = flow.dispatch.workgroups(%1, %2, %0) : (tensor<2xi32>, tensor<2x2xi32>, tensor<2x2xi32>) -> %0 =
(%arg0: !flow.dispatch.tensor<readonly:tensor<2xi32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<2x2xi32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>) {
%5 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%8 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%5, %6 : tensor<2xi32>, tensor<2x2xi32>) outs(%7 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %8, %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
flow.return
} count() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After CaptureDynamicDimsPass (iree-flow-capture-dynamic-dims) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = flow.dispatch.workgroups(%1, %2, %0) : (tensor<2xi32>, tensor<2x2xi32>, tensor<2x2xi32>) -> %0 =
(%arg0: !flow.dispatch.tensor<readonly:tensor<2xi32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<2x2xi32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>) {
%5 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%8 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%5, %6 : tensor<2xi32>, tensor<2x2xi32>) outs(%7 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %8, %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
flow.return
} count() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = flow.dispatch.workgroups(%1, %2, %0) : (tensor<2xi32>, tensor<2x2xi32>, tensor<2x2xi32>) -> %0 =
(%arg0: !flow.dispatch.tensor<readonly:tensor<2xi32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<2x2xi32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>) {
%5 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%8 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%5, %6 : tensor<2xi32>, tensor<2x2xi32>) outs(%7 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %8, %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
flow.return
} count() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = flow.dispatch.workgroups(%1, %2, %0) : (tensor<2xi32>, tensor<2x2xi32>, tensor<2x2xi32>) -> %0 =
(%arg0: !flow.dispatch.tensor<readonly:tensor<2xi32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<2x2xi32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>) {
%5 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%8 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%5, %6 : tensor<2xi32>, tensor<2x2xi32>) outs(%7 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %8, %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
flow.return
} count() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After InitializeEmptyTensorsPass (iree-flow-initialize-empty-tensors) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = flow.dispatch.workgroups(%1, %2, %0) : (tensor<2xi32>, tensor<2x2xi32>, tensor<2x2xi32>) -> %0 =
(%arg0: !flow.dispatch.tensor<readonly:tensor<2xi32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<2x2xi32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>) {
%5 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%8 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%5, %6 : tensor<2xi32>, tensor<2x2xi32>) outs(%7 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %8, %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
flow.return
} count() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After OutlineDispatchExternsPass (iree-flow-outline-dispatch-externs) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = flow.dispatch.workgroups(%1, %2, %0) : (tensor<2xi32>, tensor<2x2xi32>, tensor<2x2xi32>) -> %0 =
(%arg0: !flow.dispatch.tensor<readonly:tensor<2xi32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<2x2xi32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>) {
%5 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%8 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%5, %6 : tensor<2xi32>, tensor<2x2xi32>) outs(%7 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %8, %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
flow.return
} count() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After OutlineDispatchRegionsPass (iree-flow-outline-dispatch-regions) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
flow.executable private @scatter_dispatch_0 {
flow.executable.export public @scatter_dispatch_0 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:tensor<2xi32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<2x2xi32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>) {
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%0, %1 : tensor<2xi32>, tensor<2x2xi32>) outs(%2 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %3, %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = flow.dispatch @scatter_dispatch_0::@scatter_dispatch_0(%1, %2, %0) : (tensor<2xi32>, tensor<2x2xi32>, tensor<2x2xi32>) -> %0
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After AnnotateDispatchesPass (iree-flow-annotate-dispatches) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
flow.executable private @scatter_dispatch_0 {
flow.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readonly:tensor<2xi32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<2x2xi32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>) {
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%0, %1 : tensor<2xi32>, tensor<2x2xi32>) outs(%2 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %3, %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = flow.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%1, %2, %0) : (tensor<2xi32>, tensor<2x2xi32>, tensor<2x2xi32>) -> %0
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After StripDebugOps (iree-util-strip-debug-ops) //----- //
flow.executable private @scatter_dispatch_0 {
flow.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readonly:tensor<2xi32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<2x2xi32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>) {
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%0, %1 : tensor<2xi32>, tensor<2x2xi32>) outs(%2 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %3, %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = flow.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%1, %2, %0) : (tensor<2xi32>, tensor<2x2xi32>, tensor<2x2xi32>) -> %0
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After DeduplicateExecutablesPass (iree-flow-deduplicate-executables) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
flow.executable private @scatter_dispatch_0 {
flow.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readonly:tensor<2xi32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<2x2xi32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>) {
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%0, %1 : tensor<2xi32>, tensor<2x2xi32>) outs(%2 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %3, %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = flow.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%1, %2, %0) : (tensor<2xi32>, tensor<2x2xi32>, tensor<2x2xi32>) -> %0
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = flow.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%1, %2, %0) : (tensor<2xi32>, tensor<2x2xi32>, tensor<2x2xi32>) -> %0
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After CleanupTensorShapesPass (iree-flow-cleanup-tensor-shapes) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%cst_1 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_1 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %cst : tensor<2x2xi32>
%3 = flow.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%1, %2, %0) : (tensor<2xi32>, tensor<2x2xi32>, tensor<2x2xi32>) -> %0
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After OutlineConstantsPass (iree-flow-outline-constants) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
flow.executable private @scatter_dispatch_0 {
flow.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readonly:tensor<2xi32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<2x2xi32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>) {
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%0, %1 : tensor<2xi32>, tensor<2x2xi32>) outs(%2 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %3, %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : tensor<2x2xi32>
%cst = arith.constant dense<1> : tensor<2xi32>
%cst_0 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_0 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst : tensor<2xi32>
%2 = util.optimization_barrier %__constant_tensor_2x2xi32 : tensor<2x2xi32>
%3 = flow.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%1, %2, %0) : (tensor<2xi32>, tensor<2x2xi32>, tensor<2x2xi32>) -> %0
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
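At this point OutlineConstantsPass has hoisted the index constant into util.global @__constant_tensor_2x2xi32 (loaded as immutable), while the splat constants stay inline; the util.optimization_barrier ops keep all three operands opaque so the scatter cannot be constant-folded away, and the dispatch still ties its result to %0. A rough host-side analogue in NumPy, under the same assumptions as the sketch after the first dump (names are mine, not generated code):

    import numpy as np

    __constant_tensor_2x2xi32 = np.array([[0, 0], [1, 1]], dtype=np.int32)  # outlined global

    def optimization_barrier(x):
        # Stand-in for util.optimization_barrier: the value is unchanged, but the
        # compiler is not allowed to look through it and fold the computation.
        return x

    def scatter_dispatch_0(updates, indices, original):
        # readwrite binding: the destination is updated in place and also returned (the `-> %0` tie).
        original[indices[:, 0], indices[:, 1]] = updates
        return original

    def scatter():
        out0 = optimization_barrier(np.zeros((2, 2), dtype=np.int32))
        upd  = optimization_barrier(np.ones(2, dtype=np.int32))
        idx  = optimization_barrier(__constant_tensor_2x2xi32)
        return scatter_dispatch_0(upd, idx, out0)

    print(scatter())  # [[1 0], [0 1]]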
// -----// IR Dump After OptimizeIntArithmetic (iree-util-optimize-int-arithmetic) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<0> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : tensor<2x2xi32>
%0 = util.optimization_barrier %cst : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %__constant_tensor_2x2xi32 : tensor<2x2xi32>
%3 = flow.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%1, %2, %0) : (tensor<2xi32>, tensor<2x2xi32>, tensor<2x2xi32>) -> %0
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<0> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : tensor<2x2xi32>
%0 = util.optimization_barrier %cst : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %__constant_tensor_2x2xi32 : tensor<2x2xi32>
%3 = flow.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%1, %2, %0) : (tensor<2xi32>, tensor<2x2xi32>, tensor<2x2xi32>) -> %0
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<0> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : tensor<2x2xi32>
%0 = util.optimization_barrier %cst : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %__constant_tensor_2x2xi32 : tensor<2x2xi32>
%3 = flow.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%1, %2, %0) : (tensor<2xi32>, tensor<2x2xi32>, tensor<2x2xi32>) -> %0
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : tensor<2x2xi32>
%cst = arith.constant dense<0> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%0 = util.optimization_barrier %cst : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %__constant_tensor_2x2xi32 : tensor<2x2xi32>
%3 = flow.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%1, %2, %0) : (tensor<2xi32>, tensor<2x2xi32>, tensor<2x2xi32>) -> %0
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
flow.executable private @scatter_dispatch_0 {
flow.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readonly:tensor<2xi32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<2x2xi32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>) {
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%0, %1 : tensor<2xi32>, tensor<2x2xi32>) outs(%2 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %3, %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<1> : tensor<2xi32>
%cst_0 = arith.constant dense<0> : tensor<2x2xi32>
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_0 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst : tensor<2xi32>
%2 = util.optimization_barrier %__constant_tensor_2x2xi32 : tensor<2x2xi32>
%3 = flow.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%1, %2, %0) : (tensor<2xi32>, tensor<2x2xi32>, tensor<2x2xi32>) -> %0
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
flow.executable private @scatter_dispatch_0 {
flow.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readonly:tensor<2xi32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<2x2xi32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>) {
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%0, %1 : tensor<2xi32>, tensor<2x2xi32>) outs(%2 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %3, %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<1> : tensor<2xi32>
%cst_0 = arith.constant dense<0> : tensor<2x2xi32>
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_0 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst : tensor<2xi32>
%2 = util.optimization_barrier %__constant_tensor_2x2xi32 : tensor<2x2xi32>
%3 = flow.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%1, %2, %0) : (tensor<2xi32>, tensor<2x2xi32>, tensor<2x2xi32>) -> %0
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
flow.executable private @scatter_dispatch_0 {
flow.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readonly:tensor<2xi32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<2x2xi32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>) {
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%0, %1 : tensor<2xi32>, tensor<2x2xi32>) outs(%2 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %3, %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<1> : tensor<2xi32>
%cst_0 = arith.constant dense<0> : tensor<2x2xi32>
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_0 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst : tensor<2xi32>
%2 = util.optimization_barrier %__constant_tensor_2x2xi32 : tensor<2x2xi32>
%3 = flow.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%1, %2, %0) : (tensor<2xi32>, tensor<2x2xi32>, tensor<2x2xi32>) -> %0
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
flow.executable private @scatter_dispatch_0 {
flow.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readonly:tensor<2xi32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<2x2xi32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>) {
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%0, %1 : tensor<2xi32>, tensor<2x2xi32>) outs(%2 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %3, %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<1> : tensor<2xi32>
%cst_0 = arith.constant dense<0> : tensor<2x2xi32>
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_0 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst : tensor<2xi32>
%2 = util.optimization_barrier %__constant_tensor_2x2xi32 : tensor<2x2xi32>
%3 = flow.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%1, %2, %0) : (tensor<2xi32>, tensor<2x2xi32>, tensor<2x2xi32>) -> %0
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After FixedPointIterator (iree-util-fixed-point-iterator) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
flow.executable private @scatter_dispatch_0 {
flow.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readonly:tensor<2xi32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<2x2xi32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>) {
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%0, %1 : tensor<2xi32>, tensor<2x2xi32>) outs(%2 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %3, %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<1> : tensor<2xi32>
%cst_0 = arith.constant dense<0> : tensor<2x2xi32>
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_0 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst : tensor<2xi32>
%2 = util.optimization_barrier %__constant_tensor_2x2xi32 : tensor<2x2xi32>
%3 = flow.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%1, %2, %0) : (tensor<2xi32>, tensor<2x2xi32>, tensor<2x2xi32>) -> %0
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After SymbolDCE (symbol-dce) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
flow.executable private @scatter_dispatch_0 {
flow.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readonly:tensor<2xi32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<2x2xi32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>) {
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%0, %1 : tensor<2xi32>, tensor<2x2xi32>) outs(%2 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %3, %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<1> : tensor<2xi32>
%cst_0 = arith.constant dense<0> : tensor<2x2xi32>
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_0 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst : tensor<2xi32>
%2 = util.optimization_barrier %__constant_tensor_2x2xi32 : tensor<2x2xi32>
%3 = flow.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%1, %2, %0) : (tensor<2xi32>, tensor<2x2xi32>, tensor<2x2xi32>) -> %0
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After VerifyInputPass (iree-stream-verify-input) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
flow.executable private @scatter_dispatch_0 {
flow.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readonly:tensor<2xi32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<2x2xi32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>) {
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%0, %1 : tensor<2xi32>, tensor<2x2xi32>) outs(%2 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %3, %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<1> : tensor<2xi32>
%cst_0 = arith.constant dense<0> : tensor<2x2xi32>
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_0 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst : tensor<2xi32>
%2 = util.optimization_barrier %__constant_tensor_2x2xi32 : tensor<2x2xi32>
%3 = flow.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%1, %2, %0) : (tensor<2xi32>, tensor<2x2xi32>, tensor<2x2xi32>) -> %0
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<1> : tensor<2xi32>
%cst_0 = arith.constant dense<0> : tensor<2x2xi32>
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_0 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst : tensor<2xi32>
%2 = util.optimization_barrier %__constant_tensor_2x2xi32 : tensor<2x2xi32>
%3 = flow.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%1, %2, %0) : (tensor<2xi32>, tensor<2x2xi32>, tensor<2x2xi32>) -> %0
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<1> : tensor<2xi32>
%cst_0 = arith.constant dense<0> : tensor<2x2xi32>
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_0 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst : tensor<2xi32>
%2 = util.optimization_barrier %__constant_tensor_2x2xi32 : tensor<2x2xi32>
%3 = flow.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%1, %2, %0) : (tensor<2xi32>, tensor<2x2xi32>, tensor<2x2xi32>) -> %0
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : tensor<2x2xi32>
%cst = arith.constant dense<1> : tensor<2xi32>
%cst_0 = arith.constant dense<0> : tensor<2x2xi32>
%0 = util.optimization_barrier %cst_0 : tensor<2x2xi32>
%1 = util.optimization_barrier %cst : tensor<2xi32>
%2 = util.optimization_barrier %__constant_tensor_2x2xi32 : tensor<2x2xi32>
%3 = flow.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%1, %2, %0) : (tensor<2xi32>, tensor<2x2xi32>, tensor<2x2xi32>) -> %0
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
flow.executable private @scatter_dispatch_0 {
flow.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readonly:tensor<2xi32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<2x2xi32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>) {
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%0, %1 : tensor<2xi32>, tensor<2x2xi32>) outs(%2 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %3, %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<0> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : tensor<2x2xi32>
%0 = util.optimization_barrier %cst : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %__constant_tensor_2x2xi32 : tensor<2x2xi32>
%3 = flow.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%1, %2, %0) : (tensor<2xi32>, tensor<2x2xi32>, tensor<2x2xi32>) -> %0
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
flow.executable private @scatter_dispatch_0 {
flow.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readonly:tensor<2xi32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<2x2xi32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>) {
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%0, %1 : tensor<2xi32>, tensor<2x2xi32>) outs(%2 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %3, %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<0> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : tensor<2x2xi32>
%0 = util.optimization_barrier %cst : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %__constant_tensor_2x2xi32 : tensor<2x2xi32>
%3 = flow.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%1, %2, %0) : (tensor<2xi32>, tensor<2x2xi32>, tensor<2x2xi32>) -> %0
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
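// Note: no change here; with only one global in the module there is nothing for FuseGlobals to merge.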
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
flow.executable private @scatter_dispatch_0 {
flow.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readonly:tensor<2xi32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<2x2xi32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>) {
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%0, %1 : tensor<2xi32>, tensor<2x2xi32>) outs(%2 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %3, %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<0> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : tensor<2x2xi32>
%0 = util.optimization_barrier %cst : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %__constant_tensor_2x2xi32 : tensor<2x2xi32>
%3 = flow.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%1, %2, %0) : (tensor<2xi32>, tensor<2x2xi32>, tensor<2x2xi32>) -> %0
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
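// Note: interprocedural optimization is a no-op for this module; @scatter makes no internal calls whose
// arguments or results could be propagated, so the IR is reproduced verbatim.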
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
flow.executable private @scatter_dispatch_0 {
flow.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readonly:tensor<2xi32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<2x2xi32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>) {
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%3 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%0, %1 : tensor<2xi32>, tensor<2x2xi32>) outs(%2 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %3, %arg2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = arith.constant dense<0> : tensor<2x2xi32>
%cst_0 = arith.constant dense<1> : tensor<2xi32>
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : tensor<2x2xi32>
%0 = util.optimization_barrier %cst : tensor<2x2xi32>
%1 = util.optimization_barrier %cst_0 : tensor<2xi32>
%2 = util.optimization_barrier %__constant_tensor_2x2xi32 : tensor<2x2xi32>
%3 = flow.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%1, %2, %0) : (tensor<2xi32>, tensor<2x2xi32>, tensor<2x2xi32>) -> %0
%4 = hal.tensor.export %3 "output0" : tensor<2x2xi32> -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After ConvertToStreamPass (iree-stream-conversion) //----- //
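// Note: this is the flow -> stream transition. flow.executable becomes stream.executable whose entry point takes
// !stream.binding arguments resolved through stream.binding.subspan, flow.dispatch becomes stream.async.dispatch over
// explicitly sized !stream.resource values, the dense<0>/dense<1> constants become stream.tensor.constant plus
// stream.async.transfer, and the constant global gains a companion @__constant_tensor_2x2xi32__size global set by a util.initializer.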
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global private @__constant_tensor_2x2xi32__size : index
util.initializer {
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<2x2xi32> in !stream.resource<constant> = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %0, @__constant_tensor_2x2xi32__size : index
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<2x2xi32> in !stream.resource<constant> = dense<0> : tensor<2x2xi32>
%0 = stream.resource.size %cst : !stream.resource<constant>
%1 = stream.async.transfer %cst : !stream.resource<constant>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%0}
%cst_0 = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<2xi32> in !stream.resource<constant> = dense<1> : tensor<2xi32>
%2 = stream.resource.size %cst_0 : !stream.resource<constant>
%3 = stream.async.transfer %cst_0 : !stream.resource<constant>{%2} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%2}
%__constant_tensor_2x2xi32 = util.global.load @__constant_tensor_2x2xi32 : !stream.resource<constant>
%__constant_tensor_2x2xi32__size = util.global.load @__constant_tensor_2x2xi32__size : index
%4 = stream.async.transfer %__constant_tensor_2x2xi32 : !stream.resource<constant>{%__constant_tensor_2x2xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_2x2xi32__size}
%5 = util.optimization_barrier %1 : !stream.resource<*>
%6 = util.optimization_barrier %3 : !stream.resource<*>
%7 = util.optimization_barrier %4 : !stream.resource<*>
%c0 = arith.constant 0 : index
%8 = stream.resource.size %6 : !stream.resource<*>
%9 = stream.resource.size %7 : !stream.resource<*>
%10 = stream.resource.size %5 : !stream.resource<*>
%11 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%6[%c0 to %8 for %8], %7[%c0 to %9 for %9], %5[%c0 to %10 for %10]) : (!stream.resource<*>{%8}, !stream.resource<*>{%9}, !stream.resource<*>{%10}) -> %5{%10}
%12 = stream.async.transfer %11 : !stream.resource<*>{%10} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%10}
%13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2x2xi32> in !stream.resource<external>{%10} -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
}
// -----// IR Dump After VerifyLoweringToTensorsPass (iree-stream-verify-lowering-to-tensors) //----- //
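// Note: this appears to be a verification-only step; the IR below matches the previous dump exactly.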
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global private @__constant_tensor_2x2xi32__size : index
util.initializer {
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<2x2xi32> in !stream.resource<constant> = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %0, @__constant_tensor_2x2xi32__size : index
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<2x2xi32> in !stream.resource<constant> = dense<0> : tensor<2x2xi32>
%0 = stream.resource.size %cst : !stream.resource<constant>
%1 = stream.async.transfer %cst : !stream.resource<constant>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%0}
%cst_0 = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<2xi32> in !stream.resource<constant> = dense<1> : tensor<2xi32>
%2 = stream.resource.size %cst_0 : !stream.resource<constant>
%3 = stream.async.transfer %cst_0 : !stream.resource<constant>{%2} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%2}
%__constant_tensor_2x2xi32 = util.global.load @__constant_tensor_2x2xi32 : !stream.resource<constant>
%__constant_tensor_2x2xi32__size = util.global.load @__constant_tensor_2x2xi32__size : index
%4 = stream.async.transfer %__constant_tensor_2x2xi32 : !stream.resource<constant>{%__constant_tensor_2x2xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_2x2xi32__size}
%5 = util.optimization_barrier %1 : !stream.resource<*>
%6 = util.optimization_barrier %3 : !stream.resource<*>
%7 = util.optimization_barrier %4 : !stream.resource<*>
%c0 = arith.constant 0 : index
%8 = stream.resource.size %6 : !stream.resource<*>
%9 = stream.resource.size %7 : !stream.resource<*>
%10 = stream.resource.size %5 : !stream.resource<*>
%11 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%6[%c0 to %8 for %8], %7[%c0 to %9 for %9], %5[%c0 to %10 for %10]) : (!stream.resource<*>{%8}, !stream.resource<*>{%9}, !stream.resource<*>{%10}) -> %5{%10}
%12 = stream.async.transfer %11 : !stream.resource<*>{%10} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%10}
%13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2x2xi32> in !stream.resource<external>{%10} -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
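// Note: from this point the canonicalize/cse/simplify-global-accesses dumps are emitted per function
// (dispatch body, initializer, @scatter) rather than for the whole module. The dispatch body and the
// initializer come through these passes unchanged.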
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.initializer {
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<2x2xi32> in !stream.resource<constant> = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %0, @__constant_tensor_2x2xi32__size : index
util.return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
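// Note: canonicalization folds the splat constants in @scatter: the dense<0> tensor<2x2xi32> and dense<1> tensor<2xi32>
// stream.tensor.constant ops become stream.tensor.splat of %c0_i8 / %c1_i32 sized via stream.tensor.sizeof, and their
// stream.async.transfer copies are dropped; only the value loaded from the global still goes through a transfer.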
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2x2xi32> : index
%1 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %c0_i8 : i8 -> tensor<2x2xi32> in !stream.resource<*>{%0}
%2 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xi32> : index
%3 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %c1_i32 : i32 -> tensor<2xi32> in !stream.resource<*>{%2}
%__constant_tensor_2x2xi32 = util.global.load @__constant_tensor_2x2xi32 : !stream.resource<constant>
%__constant_tensor_2x2xi32__size = util.global.load @__constant_tensor_2x2xi32__size : index
%4 = stream.async.transfer %__constant_tensor_2x2xi32 : !stream.resource<constant>{%__constant_tensor_2x2xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_2x2xi32__size}
%5 = util.optimization_barrier %1 : !stream.resource<*>
%6 = util.optimization_barrier %3 : !stream.resource<*>
%7 = util.optimization_barrier %4 : !stream.resource<*>
%8 = stream.resource.size %6 : !stream.resource<*>
%9 = stream.resource.size %7 : !stream.resource<*>
%10 = stream.resource.size %5 : !stream.resource<*>
%11 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%6[%c0 to %8 for %8], %7[%c0 to %9 for %9], %5[%c0 to %10 for %10]) : (!stream.resource<*>{%8}, !stream.resource<*>{%9}, !stream.resource<*>{%10}) -> %5{%10}
%12 = stream.async.transfer %11 : !stream.resource<*>{%10} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%10}
%13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2x2xi32> in !stream.resource<external>{%10} -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After Inliner (inline) //----- //
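// Note: there are no calls to inline; this dump simply shows the per-function results reassembled into the full module.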
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global private @__constant_tensor_2x2xi32__size : index
util.initializer {
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<2x2xi32> in !stream.resource<constant> = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %0, @__constant_tensor_2x2xi32__size : index
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2x2xi32> : index
%1 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %c0_i8 : i8 -> tensor<2x2xi32> in !stream.resource<*>{%0}
%2 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xi32> : index
%3 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %c1_i32 : i32 -> tensor<2xi32> in !stream.resource<*>{%2}
%__constant_tensor_2x2xi32 = util.global.load @__constant_tensor_2x2xi32 : !stream.resource<constant>
%__constant_tensor_2x2xi32__size = util.global.load @__constant_tensor_2x2xi32__size : index
%4 = stream.async.transfer %__constant_tensor_2x2xi32 : !stream.resource<constant>{%__constant_tensor_2x2xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_2x2xi32__size}
%5 = util.optimization_barrier %1 : !stream.resource<*>
%6 = util.optimization_barrier %3 : !stream.resource<*>
%7 = util.optimization_barrier %4 : !stream.resource<*>
%8 = stream.resource.size %6 : !stream.resource<*>
%9 = stream.resource.size %7 : !stream.resource<*>
%10 = stream.resource.size %5 : !stream.resource<*>
%11 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%6[%c0 to %8 for %8], %7[%c0 to %9 for %9], %5[%c0 to %10 for %10]) : (!stream.resource<*>{%8}, !stream.resource<*>{%9}, !stream.resource<*>{%10}) -> %5{%10}
%12 = stream.async.transfer %11 : !stream.resource<*>{%10} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%10}
%13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2x2xi32> in !stream.resource<external>{%10} -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.initializer {
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<2x2xi32> in !stream.resource<constant> = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %0, @__constant_tensor_2x2xi32__size : index
util.return
}
// -----// IR Dump After CSE (cse) //----- //
util.initializer {
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<2x2xi32> in !stream.resource<constant> = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %0, @__constant_tensor_2x2xi32__size : index
util.return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.initializer {
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<2x2xi32> in !stream.resource<constant> = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %0, @__constant_tensor_2x2xi32__size : index
util.return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2x2xi32> : index
%1 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %c0_i8 : i8 -> tensor<2x2xi32> in !stream.resource<*>{%0}
%2 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xi32> : index
%3 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %c1_i32 : i32 -> tensor<2xi32> in !stream.resource<*>{%2}
%__constant_tensor_2x2xi32 = util.global.load @__constant_tensor_2x2xi32 : !stream.resource<constant>
%__constant_tensor_2x2xi32__size = util.global.load @__constant_tensor_2x2xi32__size : index
%4 = stream.async.transfer %__constant_tensor_2x2xi32 : !stream.resource<constant>{%__constant_tensor_2x2xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_2x2xi32__size}
%5 = util.optimization_barrier %1 : !stream.resource<*>
%6 = util.optimization_barrier %3 : !stream.resource<*>
%7 = util.optimization_barrier %4 : !stream.resource<*>
%8 = stream.resource.size %6 : !stream.resource<*>
%9 = stream.resource.size %7 : !stream.resource<*>
%10 = stream.resource.size %5 : !stream.resource<*>
%11 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%6[%c0 to %8 for %8], %7[%c0 to %9 for %9], %5[%c0 to %10 for %10]) : (!stream.resource<*>{%8}, !stream.resource<*>{%9}, !stream.resource<*>{%10}) -> %5{%10}
%12 = stream.async.transfer %11 : !stream.resource<*>{%10} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%10}
%13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2x2xi32> in !stream.resource<external>{%10} -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2x2xi32> : index
%1 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %c0_i8 : i8 -> tensor<2x2xi32> in !stream.resource<*>{%0}
%2 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xi32> : index
%3 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %c1_i32 : i32 -> tensor<2xi32> in !stream.resource<*>{%2}
%__constant_tensor_2x2xi32 = util.global.load @__constant_tensor_2x2xi32 : !stream.resource<constant>
%__constant_tensor_2x2xi32__size = util.global.load @__constant_tensor_2x2xi32__size : index
%4 = stream.async.transfer %__constant_tensor_2x2xi32 : !stream.resource<constant>{%__constant_tensor_2x2xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_2x2xi32__size}
%5 = util.optimization_barrier %1 : !stream.resource<*>
%6 = util.optimization_barrier %3 : !stream.resource<*>
%7 = util.optimization_barrier %4 : !stream.resource<*>
%8 = stream.resource.size %6 : !stream.resource<*>
%9 = stream.resource.size %7 : !stream.resource<*>
%10 = stream.resource.size %5 : !stream.resource<*>
%11 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%6[%c0 to %8 for %8], %7[%c0 to %9 for %9], %5[%c0 to %10 for %10]) : (!stream.resource<*>{%8}, !stream.resource<*>{%9}, !stream.resource<*>{%10}) -> %5{%10}
%12 = stream.async.transfer %11 : !stream.resource<*>{%10} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%10}
%13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2x2xi32> in !stream.resource<external>{%10} -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
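// Note: the two util.global.load ops are hoisted to the top of @scatter; nothing in the function stores to these
// globals, so the reordering is safe and groups the global accesses for later folding.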
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%__constant_tensor_2x2xi32 = util.global.load @__constant_tensor_2x2xi32 : !stream.resource<constant>
%__constant_tensor_2x2xi32__size = util.global.load @__constant_tensor_2x2xi32__size : index
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2x2xi32> : index
%1 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %c0_i8 : i8 -> tensor<2x2xi32> in !stream.resource<*>{%0}
%2 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xi32> : index
%3 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %c1_i32 : i32 -> tensor<2xi32> in !stream.resource<*>{%2}
%4 = stream.async.transfer %__constant_tensor_2x2xi32 : !stream.resource<constant>{%__constant_tensor_2x2xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_2x2xi32__size}
%5 = util.optimization_barrier %1 : !stream.resource<*>
%6 = util.optimization_barrier %3 : !stream.resource<*>
%7 = util.optimization_barrier %4 : !stream.resource<*>
%8 = stream.resource.size %6 : !stream.resource<*>
%9 = stream.resource.size %7 : !stream.resource<*>
%10 = stream.resource.size %5 : !stream.resource<*>
%11 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%6[%c0 to %8 for %8], %7[%c0 to %9 for %9], %5[%c0 to %10 for %10]) : (!stream.resource<*>{%8}, !stream.resource<*>{%9}, !stream.resource<*>{%10}) -> %5{%10}
%12 = stream.async.transfer %11 : !stream.resource<*>{%10} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%10}
%13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2x2xi32> in !stream.resource<external>{%10} -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
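// Note: only a cosmetic reordering here; the arith.constant ops are sorted ahead of the global loads in @scatter,
// and the rest of the module is untouched.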
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global private @__constant_tensor_2x2xi32__size : index
util.initializer {
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<2x2xi32> in !stream.resource<constant> = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %0, @__constant_tensor_2x2xi32__size : index
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%__constant_tensor_2x2xi32 = util.global.load @__constant_tensor_2x2xi32 : !stream.resource<constant>
%__constant_tensor_2x2xi32__size = util.global.load @__constant_tensor_2x2xi32__size : index
%0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2x2xi32> : index
%1 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %c0_i8 : i8 -> tensor<2x2xi32> in !stream.resource<*>{%0}
%2 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xi32> : index
%3 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %c1_i32 : i32 -> tensor<2xi32> in !stream.resource<*>{%2}
%4 = stream.async.transfer %__constant_tensor_2x2xi32 : !stream.resource<constant>{%__constant_tensor_2x2xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_2x2xi32__size}
%5 = util.optimization_barrier %1 : !stream.resource<*>
%6 = util.optimization_barrier %3 : !stream.resource<*>
%7 = util.optimization_barrier %4 : !stream.resource<*>
%8 = stream.resource.size %6 : !stream.resource<*>
%9 = stream.resource.size %7 : !stream.resource<*>
%10 = stream.resource.size %5 : !stream.resource<*>
%11 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%6[%c0 to %8 for %8], %7[%c0 to %9 for %9], %5[%c0 to %10 for %10]) : (!stream.resource<*>{%8}, !stream.resource<*>{%9}, !stream.resource<*>{%10}) -> %5{%10}
%12 = stream.async.transfer %11 : !stream.resource<*>{%10} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%10}
%13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2x2xi32> in !stream.resource<external>{%10} -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
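// Note: the loads of @__constant_tensor_2x2xi32 and @__constant_tensor_2x2xi32__size are now marked immutable;
// both globals are only ever stored from the initializer, so every later load observes the same value.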
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global private @__constant_tensor_2x2xi32__size : index
util.initializer {
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<2x2xi32> in !stream.resource<constant> = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %0, @__constant_tensor_2x2xi32__size : index
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%__constant_tensor_2x2xi32__size = util.global.load immutable @__constant_tensor_2x2xi32__size : index
%0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2x2xi32> : index
%1 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %c0_i8 : i8 -> tensor<2x2xi32> in !stream.resource<*>{%0}
%2 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xi32> : index
%3 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %c1_i32 : i32 -> tensor<2xi32> in !stream.resource<*>{%2}
%4 = stream.async.transfer %__constant_tensor_2x2xi32 : !stream.resource<constant>{%__constant_tensor_2x2xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_2x2xi32__size}
%5 = util.optimization_barrier %1 : !stream.resource<*>
%6 = util.optimization_barrier %3 : !stream.resource<*>
%7 = util.optimization_barrier %4 : !stream.resource<*>
%8 = stream.resource.size %6 : !stream.resource<*>
%9 = stream.resource.size %7 : !stream.resource<*>
%10 = stream.resource.size %5 : !stream.resource<*>
%11 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%6[%c0 to %8 for %8], %7[%c0 to %9 for %9], %5[%c0 to %10 for %10]) : (!stream.resource<*>{%8}, !stream.resource<*>{%9}, !stream.resource<*>{%10}) -> %5{%10}
%12 = stream.async.transfer %11 : !stream.resource<*>{%10} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%10}
%13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2x2xi32> in !stream.resource<external>{%10} -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
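// Note: unchanged; the constant resource global and its __size index global remain separate.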
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global private @__constant_tensor_2x2xi32__size : index
util.initializer {
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<2x2xi32> in !stream.resource<constant> = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %0, @__constant_tensor_2x2xi32__size : index
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%__constant_tensor_2x2xi32__size = util.global.load immutable @__constant_tensor_2x2xi32__size : index
%0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2x2xi32> : index
%1 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %c0_i8 : i8 -> tensor<2x2xi32> in !stream.resource<*>{%0}
%2 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xi32> : index
%3 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %c1_i32 : i32 -> tensor<2xi32> in !stream.resource<*>{%2}
%4 = stream.async.transfer %__constant_tensor_2x2xi32 : !stream.resource<constant>{%__constant_tensor_2x2xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_2x2xi32__size}
%5 = util.optimization_barrier %1 : !stream.resource<*>
%6 = util.optimization_barrier %3 : !stream.resource<*>
%7 = util.optimization_barrier %4 : !stream.resource<*>
%8 = stream.resource.size %6 : !stream.resource<*>
%9 = stream.resource.size %7 : !stream.resource<*>
%10 = stream.resource.size %5 : !stream.resource<*>
%11 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%6[%c0 to %8 for %8], %7[%c0 to %9 for %9], %5[%c0 to %10 for %10]) : (!stream.resource<*>{%8}, !stream.resource<*>{%9}, !stream.resource<*>{%10}) -> %5{%10}
%12 = stream.async.transfer %11 : !stream.resource<*>{%10} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%10}
%13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2x2xi32> in !stream.resource<external>{%10} -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
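// Note: IPO again makes no changes to this single-function module.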
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global private @__constant_tensor_2x2xi32__size : index
util.initializer {
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<2x2xi32> in !stream.resource<constant> = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %0, @__constant_tensor_2x2xi32__size : index
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%__constant_tensor_2x2xi32__size = util.global.load immutable @__constant_tensor_2x2xi32__size : index
%0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2x2xi32> : index
%1 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %c0_i8 : i8 -> tensor<2x2xi32> in !stream.resource<*>{%0}
%2 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xi32> : index
%3 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %c1_i32 : i32 -> tensor<2xi32> in !stream.resource<*>{%2}
%4 = stream.async.transfer %__constant_tensor_2x2xi32 : !stream.resource<constant>{%__constant_tensor_2x2xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_2x2xi32__size}
%5 = util.optimization_barrier %1 : !stream.resource<*>
%6 = util.optimization_barrier %3 : !stream.resource<*>
%7 = util.optimization_barrier %4 : !stream.resource<*>
%8 = stream.resource.size %6 : !stream.resource<*>
%9 = stream.resource.size %7 : !stream.resource<*>
%10 = stream.resource.size %5 : !stream.resource<*>
%11 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%6[%c0 to %8 for %8], %7[%c0 to %9 for %9], %5[%c0 to %10 for %10]) : (!stream.resource<*>{%8}, !stream.resource<*>{%9}, !stream.resource<*>{%10}) -> %5{%10}
%12 = stream.async.transfer %11 : !stream.resource<*>{%10} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%10}
%13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2x2xi32> in !stream.resource<external>{%10} -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
}
// -----// IR Dump After CombineInitializers (iree-util-combine-initializers) //----- //
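// Note: the module contains a single util.initializer, so there is nothing to combine and the IR is unchanged.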
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global private @__constant_tensor_2x2xi32__size : index
util.initializer {
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<2x2xi32> in !stream.resource<constant> = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %0, @__constant_tensor_2x2xi32__size : index
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%__constant_tensor_2x2xi32__size = util.global.load immutable @__constant_tensor_2x2xi32__size : index
%0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2x2xi32> : index
%1 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %c0_i8 : i8 -> tensor<2x2xi32> in !stream.resource<*>{%0}
%2 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xi32> : index
%3 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %c1_i32 : i32 -> tensor<2xi32> in !stream.resource<*>{%2}
%4 = stream.async.transfer %__constant_tensor_2x2xi32 : !stream.resource<constant>{%__constant_tensor_2x2xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_2x2xi32__size}
%5 = util.optimization_barrier %1 : !stream.resource<*>
%6 = util.optimization_barrier %3 : !stream.resource<*>
%7 = util.optimization_barrier %4 : !stream.resource<*>
%8 = stream.resource.size %6 : !stream.resource<*>
%9 = stream.resource.size %7 : !stream.resource<*>
%10 = stream.resource.size %5 : !stream.resource<*>
%11 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%6[%c0 to %8 for %8], %7[%c0 to %9 for %9], %5[%c0 to %10 for %10]) : (!stream.resource<*>{%8}, !stream.resource<*>{%9}, !stream.resource<*>{%10}) -> %5{%10}
%12 = stream.async.transfer %11 : !stream.resource<*>{%10} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%10}
%13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2x2xi32> in !stream.resource<external>{%10} -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
}
// -----// IR Dump After EncodeDeviceTensorsPass (iree-stream-encode-device-tensors) //----- //
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
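
For reference (not part of the compiler dump): the `scatter_dispatch_0` body above implements the same `iree_linalg_ext.scatter` seen since the input IR, with `dimension_map = [0, 1]`, scalar updates, and a region that yields the update value. A minimal NumPy sketch of that semantics, assuming the inputs visible earlier in this log (updates splatted to 1 as `tensor<2xi32>`, indices `[[0, 0], [1, 1]]`, output initialized to 0), would be:

```python
import numpy as np

# Host-side model of what scatter_dispatch_0 computes (a sketch, not IREE code).
updates = np.ones(2, dtype=np.int32)                  # corresponds to %3 in the dispatch
indices = np.array([[0, 0], [1, 1]], dtype=np.int32)  # corresponds to %4 in the dispatch
out = np.zeros((2, 2), dtype=np.int32)                # corresponds to %5 (readwrite binding)

for n in range(updates.shape[0]):
    i, j = indices[n]        # dimension_map = [0, 1]: coords address dims 0 and 1 of out
    out[i, j] = updates[n]   # the scatter region yields the update, i.e. overwrite

print(out)  # [[1 0]
            #  [0 1]]
```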
// -----// IR Dump After EncodeHostTensorsPass (iree-stream-encode-host-tensors) //----- //
util.initializer {
%c16 = arith.constant 16 : index
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c16} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
util.global.store %cst, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %c16, @__constant_tensor_2x2xi32__size : index
util.return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.initializer {
%c16 = arith.constant 16 : index
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c16} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
util.global.store %cst, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %c16, @__constant_tensor_2x2xi32__size : index
util.return
}
// -----// IR Dump After CSE (cse) //----- //
util.initializer {
%c16 = arith.constant 16 : index
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c16} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
util.global.store %cst, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %c16, @__constant_tensor_2x2xi32__size : index
util.return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.initializer {
%c16 = arith.constant 16 : index
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c16} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
util.global.store %cst, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %c16, @__constant_tensor_2x2xi32__size : index
util.return
}
// -----// IR Dump After EncodeHostTensorsPass (iree-stream-encode-host-tensors) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%__constant_tensor_2x2xi32__size = util.global.load immutable @__constant_tensor_2x2xi32__size : index
%0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c0_i8 : i8 -> !stream.resource<*>{%c16}
%1 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1_i32 : i32 -> !stream.resource<*>{%c8}
%2 = stream.async.transfer %__constant_tensor_2x2xi32 : !stream.resource<constant>{%__constant_tensor_2x2xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_2x2xi32__size}
%3 = util.optimization_barrier %0 : !stream.resource<*>
%4 = util.optimization_barrier %1 : !stream.resource<*>
%5 = util.optimization_barrier %2 : !stream.resource<*>
%6 = stream.resource.size %4 : !stream.resource<*>
%7 = stream.resource.size %5 : !stream.resource<*>
%8 = stream.resource.size %3 : !stream.resource<*>
%9 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%4[%c0 to %6 for %6], %5[%c0 to %7 for %7], %3[%c0 to %8 for %8]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}, !stream.resource<*>{%8}) -> %3{%8}
%10 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8}
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x2xi32> in !stream.resource<external>{%8} -> !hal.buffer_view
util.return %11 : !hal.buffer_view
}
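
Note (not part of the dump): after EncodeHostTensorsPass the `stream.tensor.sizeof` queries are gone and the resource sizes appear as the constants `%c16` and `%c8`. Assuming densely packed i32 storage, those byte counts follow directly from the tensor shapes:

```python
import numpy as np

# Sketch of the byte sizes folded to constants above (assumes packed row-major i32).
assert np.dtype(np.int32).itemsize * 2 * 2 == 16  # tensor<2x2xi32> -> %c16
assert np.dtype(np.int32).itemsize * 2 == 8       # tensor<2xi32>   -> %c8
```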
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%__constant_tensor_2x2xi32__size = util.global.load immutable @__constant_tensor_2x2xi32__size : index
%0 = stream.async.transfer %__constant_tensor_2x2xi32 : !stream.resource<constant>{%__constant_tensor_2x2xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_2x2xi32__size}
%1 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c0_i8 : i8 -> !stream.resource<*>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<*>
%3 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1_i32 : i32 -> !stream.resource<*>{%c8}
%4 = util.optimization_barrier %3 : !stream.resource<*>
%5 = util.optimization_barrier %0 : !stream.resource<*>
%6 = stream.resource.size %4 : !stream.resource<*>
%7 = stream.resource.size %5 : !stream.resource<*>
%8 = stream.resource.size %2 : !stream.resource<*>
%9 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%4[%c0 to %6 for %6], %5[%c0 to %7 for %7], %2[%c0 to %8 for %8]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}, !stream.resource<*>{%8}) -> %2{%8}
%10 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8}
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x2xi32> in !stream.resource<external>{%8} -> !hal.buffer_view
util.return %11 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%__constant_tensor_2x2xi32__size = util.global.load immutable @__constant_tensor_2x2xi32__size : index
%0 = stream.async.transfer %__constant_tensor_2x2xi32 : !stream.resource<constant>{%__constant_tensor_2x2xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_2x2xi32__size}
%1 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c0_i8 : i8 -> !stream.resource<*>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<*>
%3 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1_i32 : i32 -> !stream.resource<*>{%c8}
%4 = util.optimization_barrier %3 : !stream.resource<*>
%5 = util.optimization_barrier %0 : !stream.resource<*>
%6 = stream.resource.size %4 : !stream.resource<*>
%7 = stream.resource.size %5 : !stream.resource<*>
%8 = stream.resource.size %2 : !stream.resource<*>
%9 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%4[%c0 to %6 for %6], %5[%c0 to %7 for %7], %2[%c0 to %8 for %8]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}, !stream.resource<*>{%8}) -> %2{%8}
%10 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8}
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x2xi32> in !stream.resource<external>{%8} -> !hal.buffer_view
util.return %11 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%__constant_tensor_2x2xi32__size = util.global.load immutable @__constant_tensor_2x2xi32__size : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%0 = stream.async.transfer %__constant_tensor_2x2xi32 : !stream.resource<constant>{%__constant_tensor_2x2xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_2x2xi32__size}
%1 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c0_i8 : i8 -> !stream.resource<*>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<*>
%3 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1_i32 : i32 -> !stream.resource<*>{%c8}
%4 = util.optimization_barrier %3 : !stream.resource<*>
%5 = util.optimization_barrier %0 : !stream.resource<*>
%6 = stream.resource.size %4 : !stream.resource<*>
%7 = stream.resource.size %5 : !stream.resource<*>
%8 = stream.resource.size %2 : !stream.resource<*>
%9 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%4[%c0 to %6 for %6], %5[%c0 to %7 for %7], %2[%c0 to %8 for %8]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}, !stream.resource<*>{%8}) -> %2{%8}
%10 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8}
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x2xi32> in !stream.resource<external>{%8} -> !hal.buffer_view
util.return %11 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global private @__constant_tensor_2x2xi32__size = 16 : index
util.initializer {
%c16 = arith.constant 16 : index
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c16} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
util.global.store %cst, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%__constant_tensor_2x2xi32__size = util.global.load immutable @__constant_tensor_2x2xi32__size : index
%0 = stream.async.transfer %__constant_tensor_2x2xi32 : !stream.resource<constant>{%__constant_tensor_2x2xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_2x2xi32__size}
%1 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c0_i8 : i8 -> !stream.resource<*>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<*>
%3 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1_i32 : i32 -> !stream.resource<*>{%c8}
%4 = util.optimization_barrier %3 : !stream.resource<*>
%5 = util.optimization_barrier %0 : !stream.resource<*>
%6 = stream.resource.size %4 : !stream.resource<*>
%7 = stream.resource.size %5 : !stream.resource<*>
%8 = stream.resource.size %2 : !stream.resource<*>
%9 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%4[%c0 to %6 for %6], %5[%c0 to %7 for %7], %2[%c0 to %8 for %8]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}, !stream.resource<*>{%8}) -> %2{%8}
%10 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8}
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x2xi32> in !stream.resource<external>{%8} -> !hal.buffer_view
util.return %11 : !hal.buffer_view
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c16 = arith.constant 16 : index
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c16} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
util.global.store %cst, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%0 = stream.async.transfer %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c16} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c16}
%1 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c0_i8 : i8 -> !stream.resource<*>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<*>
%3 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1_i32 : i32 -> !stream.resource<*>{%c8}
%4 = util.optimization_barrier %3 : !stream.resource<*>
%5 = util.optimization_barrier %0 : !stream.resource<*>
%6 = stream.resource.size %4 : !stream.resource<*>
%7 = stream.resource.size %5 : !stream.resource<*>
%8 = stream.resource.size %2 : !stream.resource<*>
%9 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%4[%c0 to %6 for %6], %5[%c0 to %7 for %7], %2[%c0 to %8 for %8]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}, !stream.resource<*>{%8}) -> %2{%8}
%10 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8}
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x2xi32> in !stream.resource<external>{%8} -> !hal.buffer_view
util.return %11 : !hal.buffer_view
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c16 = arith.constant 16 : index
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c16} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
util.global.store %cst, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%0 = stream.async.transfer %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c16} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c16}
%1 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c0_i8 : i8 -> !stream.resource<*>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<*>
%3 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1_i32 : i32 -> !stream.resource<*>{%c8}
%4 = util.optimization_barrier %3 : !stream.resource<*>
%5 = util.optimization_barrier %0 : !stream.resource<*>
%6 = stream.resource.size %4 : !stream.resource<*>
%7 = stream.resource.size %5 : !stream.resource<*>
%8 = stream.resource.size %2 : !stream.resource<*>
%9 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%4[%c0 to %6 for %6], %5[%c0 to %7 for %7], %2[%c0 to %8 for %8]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}, !stream.resource<*>{%8}) -> %2{%8}
%10 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8}
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x2xi32> in !stream.resource<external>{%8} -> !hal.buffer_view
util.return %11 : !hal.buffer_view
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c16 = arith.constant 16 : index
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c16} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
util.global.store %cst, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%0 = stream.async.transfer %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c16} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c16}
%1 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c0_i8 : i8 -> !stream.resource<*>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<*>
%3 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1_i32 : i32 -> !stream.resource<*>{%c8}
%4 = util.optimization_barrier %3 : !stream.resource<*>
%5 = util.optimization_barrier %0 : !stream.resource<*>
%6 = stream.resource.size %4 : !stream.resource<*>
%7 = stream.resource.size %5 : !stream.resource<*>
%8 = stream.resource.size %2 : !stream.resource<*>
%9 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%4[%c0 to %6 for %6], %5[%c0 to %7 for %7], %2[%c0 to %8 for %8]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}, !stream.resource<*>{%8}) -> %2{%8}
%10 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8}
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x2xi32> in !stream.resource<external>{%8} -> !hal.buffer_view
util.return %11 : !hal.buffer_view
}
}
// -----// IR Dump After VerifyLoweringToAsyncResourcesPass (iree-stream-verify-lowering-to-async-resources) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c16 = arith.constant 16 : index
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c16} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
util.global.store %cst, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%0 = stream.async.transfer %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c16} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c16}
%1 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c0_i8 : i8 -> !stream.resource<*>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<*>
%3 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1_i32 : i32 -> !stream.resource<*>{%c8}
%4 = util.optimization_barrier %3 : !stream.resource<*>
%5 = util.optimization_barrier %0 : !stream.resource<*>
%6 = stream.resource.size %4 : !stream.resource<*>
%7 = stream.resource.size %5 : !stream.resource<*>
%8 = stream.resource.size %2 : !stream.resource<*>
%9 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%4[%c0 to %6 for %6], %5[%c0 to %7 for %7], %2[%c0 to %8 for %8]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}, !stream.resource<*>{%8}) -> %2{%8}
%10 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8}
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x2xi32> in !stream.resource<external>{%8} -> !hal.buffer_view
util.return %11 : !hal.buffer_view
}
}
// -----// IR Dump After MaterializeCopyOnWritePass (iree-stream-materialize-copy-on-write) //----- //
util.initializer {
%c16 = arith.constant 16 : index
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c16} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
util.global.store %cst, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.initializer {
%c16 = arith.constant 16 : index
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c16} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
util.global.store %cst, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.return
}
// -----// IR Dump After MaterializeCopyOnWritePass (iree-stream-materialize-copy-on-write) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%0 = stream.async.transfer %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c16} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c16}
%1 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c0_i8 : i8 -> !stream.resource<*>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<*>
%3 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1_i32 : i32 -> !stream.resource<*>{%c8}
%4 = util.optimization_barrier %3 : !stream.resource<*>
%5 = util.optimization_barrier %0 : !stream.resource<*>
%6 = stream.resource.size %4 : !stream.resource<*>
%7 = stream.resource.size %5 : !stream.resource<*>
%8 = stream.resource.size %2 : !stream.resource<*>
%9 = stream.async.clone on(#hal.device.affinity<@__device_0>) %2 : !stream.resource<*>{%8} -> !stream.resource<*>{%8}
%10 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%4[%c0 to %6 for %6], %5[%c0 to %7 for %7], %9[%c0 to %8 for %8]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}, !stream.resource<*>{%8}) -> %9{%8}
%11 = stream.async.transfer %10 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8}
%12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x2xi32> in !stream.resource<external>{%8} -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%0 = stream.async.transfer %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c16} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c16}
%1 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c0_i8 : i8 -> !stream.resource<*>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<*>
%3 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1_i32 : i32 -> !stream.resource<*>{%c8}
%4 = util.optimization_barrier %3 : !stream.resource<*>
%5 = util.optimization_barrier %0 : !stream.resource<*>
%6 = stream.resource.size %4 : !stream.resource<*>
%7 = stream.resource.size %5 : !stream.resource<*>
%8 = stream.resource.size %2 : !stream.resource<*>
%9 = stream.async.clone on(#hal.device.affinity<@__device_0>) %2 : !stream.resource<*>{%8} -> !stream.resource<*>{%8}
%10 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%4[%c0 to %6 for %6], %5[%c0 to %7 for %7], %9[%c0 to %8 for %8]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}, !stream.resource<*>{%8}) -> %9{%8}
%11 = stream.async.transfer %10 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8}
%12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x2xi32> in !stream.resource<external>{%8} -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After ElideAsyncCopiesPass (iree-stream-elide-async-copies) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c16 = arith.constant 16 : index
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c16} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
util.global.store %cst, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%0 = stream.async.transfer %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c16} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c16}
%1 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c0_i8 : i8 -> !stream.resource<*>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<*>
%3 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1_i32 : i32 -> !stream.resource<*>{%c8}
%4 = util.optimization_barrier %3 : !stream.resource<*>
%5 = util.optimization_barrier %0 : !stream.resource<*>
%6 = stream.resource.size %4 : !stream.resource<*>
%7 = stream.resource.size %5 : !stream.resource<*>
%8 = stream.resource.size %2 : !stream.resource<*>
%9 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%4[%c0 to %6 for %6], %5[%c0 to %7 for %7], %2[%c0 to %8 for %8]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}, !stream.resource<*>{%8}) -> %2{%8}
%10 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8}
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x2xi32> in !stream.resource<external>{%8} -> !hal.buffer_view
util.return %11 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.initializer {
%c16 = arith.constant 16 : index
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c16} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
util.global.store %cst, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.return
}
// -----// IR Dump After EmplaceAllocationsPass (iree-stream-emplace-allocations) //----- //
util.initializer {
%c16 = arith.constant 16 : index
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c16} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
util.global.store %cst, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%0 = stream.async.transfer %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c16} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c16}
%1 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c0_i8 : i8 -> !stream.resource<*>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<*>
%3 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1_i32 : i32 -> !stream.resource<*>{%c8}
%4 = util.optimization_barrier %3 : !stream.resource<*>
%5 = util.optimization_barrier %0 : !stream.resource<*>
%6 = stream.resource.size %4 : !stream.resource<*>
%7 = stream.resource.size %5 : !stream.resource<*>
%8 = stream.resource.size %2 : !stream.resource<*>
%9 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%4[%c0 to %6 for %6], %5[%c0 to %7 for %7], %2[%c0 to %8 for %8]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}, !stream.resource<*>{%8}) -> %2{%8}
%10 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8}
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x2xi32> in !stream.resource<external>{%8} -> !hal.buffer_view
util.return %11 : !hal.buffer_view
}
// -----// IR Dump After EmplaceAllocationsPass (iree-stream-emplace-allocations) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%0 = stream.async.transfer %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c16} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c16}
%1 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c0_i8 : i8 -> !stream.resource<*>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<*>
%3 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1_i32 : i32 -> !stream.resource<*>{%c8}
%4 = util.optimization_barrier %3 : !stream.resource<*>
%5 = util.optimization_barrier %0 : !stream.resource<*>
%6 = stream.resource.size %4 : !stream.resource<*>
%7 = stream.resource.size %5 : !stream.resource<*>
%8 = stream.resource.size %2 : !stream.resource<*>
%9 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%4[%c0 to %6 for %6], %5[%c0 to %7 for %7], %2[%c0 to %8 for %8]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}, !stream.resource<*>{%8}) -> %2{%8}
%10 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8}
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x2xi32> in !stream.resource<external>{%8} -> !hal.buffer_view
util.return %11 : !hal.buffer_view
}
// -----// IR Dump After RefineUsagePass (iree-stream-refine-usage) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c16 = arith.constant 16 : index
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c16} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
util.global.store %cst, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c0_i8 : i8 -> !stream.resource<external>{%c16}
%1 = util.optimization_barrier %0 : !stream.resource<external>
%2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1_i32 : i32 -> !stream.resource<transient>{%c8}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %__constant_tensor_2x2xi32 : !stream.resource<constant>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<constant>
%7 = stream.resource.size %1 : !stream.resource<external>
%8 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%3[%c0 to %5 for %5], %4[%c0 to %6 for %6], %1[%c0 to %7 for %7]) : (!stream.resource<transient>{%5}, !stream.resource<constant>{%6}, !stream.resource<external>{%7}) -> %1{%7}
%9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2x2xi32> in !stream.resource<external>{%7} -> !hal.buffer_view
util.return %9 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.initializer {
%c16 = arith.constant 16 : index
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c16} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
util.global.store %cst, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.return
}
// -----// IR Dump After CSE (cse) //----- //
util.initializer {
%c16 = arith.constant 16 : index
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c16} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
util.global.store %cst, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.initializer {
%c16 = arith.constant 16 : index
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c16} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
util.global.store %cst, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c0_i8 : i8 -> !stream.resource<external>{%c16}
%1 = util.optimization_barrier %0 : !stream.resource<external>
%2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1_i32 : i32 -> !stream.resource<transient>{%c8}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %__constant_tensor_2x2xi32 : !stream.resource<constant>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<constant>
%7 = stream.resource.size %1 : !stream.resource<external>
%8 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%3[%c0 to %5 for %5], %4[%c0 to %6 for %6], %1[%c0 to %7 for %7]) : (!stream.resource<transient>{%5}, !stream.resource<constant>{%6}, !stream.resource<external>{%7}) -> %1{%7}
%9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2x2xi32> in !stream.resource<external>{%7} -> !hal.buffer_view
util.return %9 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c0_i8 : i8 -> !stream.resource<external>{%c16}
%1 = util.optimization_barrier %0 : !stream.resource<external>
%2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1_i32 : i32 -> !stream.resource<transient>{%c8}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %__constant_tensor_2x2xi32 : !stream.resource<constant>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<constant>
%7 = stream.resource.size %1 : !stream.resource<external>
%8 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%3[%c0 to %5 for %5], %4[%c0 to %6 for %6], %1[%c0 to %7 for %7]) : (!stream.resource<transient>{%5}, !stream.resource<constant>{%6}, !stream.resource<external>{%7}) -> %1{%7}
%9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2x2xi32> in !stream.resource<external>{%7} -> !hal.buffer_view
util.return %9 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c0_i8 : i8 -> !stream.resource<external>{%c16}
%1 = util.optimization_barrier %0 : !stream.resource<external>
%2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1_i32 : i32 -> !stream.resource<transient>{%c8}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %__constant_tensor_2x2xi32 : !stream.resource<constant>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<constant>
%7 = stream.resource.size %1 : !stream.resource<external>
%8 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%3[%c0 to %5 for %5], %4[%c0 to %6 for %6], %1[%c0 to %7 for %7]) : (!stream.resource<transient>{%5}, !stream.resource<constant>{%6}, !stream.resource<external>{%7}) -> %1{%7}
%9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2x2xi32> in !stream.resource<external>{%7} -> !hal.buffer_view
util.return %9 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c16 = arith.constant 16 : index
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c16} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
util.global.store %cst, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c0_i8 : i8 -> !stream.resource<external>{%c16}
%1 = util.optimization_barrier %0 : !stream.resource<external>
%2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1_i32 : i32 -> !stream.resource<transient>{%c8}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %__constant_tensor_2x2xi32 : !stream.resource<constant>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<constant>
%7 = stream.resource.size %1 : !stream.resource<external>
%8 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%3[%c0 to %5 for %5], %4[%c0 to %6 for %6], %1[%c0 to %7 for %7]) : (!stream.resource<transient>{%5}, !stream.resource<constant>{%6}, !stream.resource<external>{%7}) -> %1{%7}
%9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2x2xi32> in !stream.resource<external>{%7} -> !hal.buffer_view
util.return %9 : !hal.buffer_view
}
}
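
For reference, the dispatch body in this module applies iree_linalg_ext.scatter with dimension_map = [0, 1] and unique indices: each row of the 2x2xi32 index tensor names a full (row, col) coordinate in the 2x2 destination, and the region yields the incoming update value, i.e. a plain overwrite. With the values flowing in here (updates splatted to 1, indices [[0, 0], [1, 1]], destination zero-filled), the expected result is [[1, 0], [0, 1]]. A small plain-Python model of that behavior (a sketch of the semantics, not IREE's implementation):

# Plain-Python model of scatter_dispatch_0's scatter (illustrative only).
def scatter_2d(dest, indices, updates):
    # Write updates[i] at the (row, col) named by indices[i]; indices are assumed unique.
    for (row, col), value in zip(indices, updates):
        dest[row][col] = value
    return dest

dest = [[0, 0], [0, 0]]     # the 16-byte resource zero-filled by stream.async.splat %c0_i8
indices = [(0, 0), (1, 1)]  # the dense<[[0, 0], [1, 1]]> constant
updates = [1, 1]            # the 8-byte resource filled by stream.async.splat %c1_i32
assert scatter_2d(dest, indices, updates) == [[1, 0], [0, 1]]
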
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c16 = arith.constant 16 : index
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c16} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
util.global.store %cst, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c0_i8 : i8 -> !stream.resource<external>{%c16}
%1 = util.optimization_barrier %0 : !stream.resource<external>
%2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1_i32 : i32 -> !stream.resource<transient>{%c8}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %__constant_tensor_2x2xi32 : !stream.resource<constant>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<constant>
%7 = stream.resource.size %1 : !stream.resource<external>
%8 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%3[%c0 to %5 for %5], %4[%c0 to %6 for %6], %1[%c0 to %7 for %7]) : (!stream.resource<transient>{%5}, !stream.resource<constant>{%6}, !stream.resource<external>{%7}) -> %1{%7}
%9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2x2xi32> in !stream.resource<external>{%7} -> !hal.buffer_view
util.return %9 : !hal.buffer_view
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c16 = arith.constant 16 : index
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c16} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
util.global.store %cst, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c0_i8 : i8 -> !stream.resource<external>{%c16}
%1 = util.optimization_barrier %0 : !stream.resource<external>
%2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1_i32 : i32 -> !stream.resource<transient>{%c8}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %__constant_tensor_2x2xi32 : !stream.resource<constant>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<constant>
%7 = stream.resource.size %1 : !stream.resource<external>
%8 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%3[%c0 to %5 for %5], %4[%c0 to %6 for %6], %1[%c0 to %7 for %7]) : (!stream.resource<transient>{%5}, !stream.resource<constant>{%6}, !stream.resource<external>{%7}) -> %1{%7}
%9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2x2xi32> in !stream.resource<external>{%7} -> !hal.buffer_view
util.return %9 : !hal.buffer_view
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c16 = arith.constant 16 : index
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c16} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
util.global.store %cst, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c0_i8 : i8 -> !stream.resource<external>{%c16}
%1 = util.optimization_barrier %0 : !stream.resource<external>
%2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1_i32 : i32 -> !stream.resource<transient>{%c8}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %__constant_tensor_2x2xi32 : !stream.resource<constant>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<constant>
%7 = stream.resource.size %1 : !stream.resource<external>
%8 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%3[%c0 to %5 for %5], %4[%c0 to %6 for %6], %1[%c0 to %7 for %7]) : (!stream.resource<transient>{%5}, !stream.resource<constant>{%6}, !stream.resource<external>{%7}) -> %1{%7}
%9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2x2xi32> in !stream.resource<external>{%7} -> !hal.buffer_view
util.return %9 : !hal.buffer_view
}
}
// -----// IR Dump After VerifyAsyncAccessRangesPass (iree-stream-verify-async-access-ranges) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c16 = arith.constant 16 : index
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c16} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
util.global.store %cst, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c0_i8 : i8 -> !stream.resource<external>{%c16}
%1 = util.optimization_barrier %0 : !stream.resource<external>
%2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1_i32 : i32 -> !stream.resource<transient>{%c8}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %__constant_tensor_2x2xi32 : !stream.resource<constant>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<constant>
%7 = stream.resource.size %1 : !stream.resource<external>
%8 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%3[%c0 to %5 for %5], %4[%c0 to %6 for %6], %1[%c0 to %7 for %7]) : (!stream.resource<transient>{%5}, !stream.resource<constant>{%6}, !stream.resource<external>{%7}) -> %1{%7}
%9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2x2xi32> in !stream.resource<external>{%7} -> !hal.buffer_view
util.return %9 : !hal.buffer_view
}
}
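
The dispatch in the dump above binds each resource with an explicit access range of the form %resource[%offset to %end for %length], and iree-stream-verify-async-access-ranges checks that such ranges are consistent with the resource sizes. A plausible reading of that check in plain Python (a sketch only, not the pass implementation):

# Sketch of the in-bounds condition implied by "[%c0 to %5 for %5]"-style ranges (illustrative only).
def access_range_ok(offset, end, length, resource_size):
    return 0 <= offset and end == offset + length and end <= resource_size

assert access_range_ok(0, 8, 8, 8)     # transient operand: %5 is the size of the 8-byte splat
assert access_range_ok(0, 16, 16, 16)  # external operand: %7 is the size of the 16-byte splat
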
// -----// IR Dump After ScheduleExecutionPass (iree-stream-schedule-execution) //----- //
util.initializer {
%c16 = arith.constant 16 : index
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<constant>{%c16} {
%cst = stream.async.constant : !stream.resource<constant>{%c16} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
stream.yield %cst : !stream.resource<constant>{%c16}
} => !stream.timepoint
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<constant>{%c16}
util.global.store %0, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.return
}
// -----// IR Dump After ScheduleConcurrencyPass (iree-stream-schedule-concurrency) //----- //
util.initializer {
%c16 = arith.constant 16 : index
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<constant>{%c16} {
%cst = stream.async.constant : !stream.resource<constant>{%c16} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
stream.yield %cst : !stream.resource<constant>{%c16}
} => !stream.timepoint
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<constant>{%c16}
util.global.store %0, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.return
}
// -----// IR Dump After ScheduleExecutionPass (iree-stream-schedule-execution) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<external>{%c16} {
%10 = stream.async.splat %c0_i8 : i8 -> !stream.resource<external>{%c16}
stream.yield %10 : !stream.resource<external>{%c16}
} => !stream.timepoint
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c16}
%1 = util.optimization_barrier %0 : !stream.resource<external>
%results_0, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c8} {
%10 = stream.async.splat %c1_i32 : i32 -> !stream.resource<transient>{%c8}
stream.yield %10 : !stream.resource<transient>{%c8}
} => !stream.timepoint
%2 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c8}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %__constant_tensor_2x2xi32 : !stream.resource<constant>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<constant>
%7 = stream.resource.size %1 : !stream.resource<external>
%results_2, %result_timepoint_3 = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<constant>{%6}, %1 as %arg2: !stream.resource<external>{%7}) -> %1{%7} {
%10 = stream.async.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0[%c0 to %5 for %5], %arg1[%c0 to %6 for %6], %arg2[%c0 to %7 for %7]) : (!stream.resource<transient>{%5}, !stream.resource<constant>{%6}, !stream.resource<external>{%7}) -> %arg2{%7}
stream.yield %10 : !stream.resource<external>{%7}
} => !stream.timepoint
%8 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%7}
%9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2x2xi32> in !stream.resource<external>{%7} -> !hal.buffer_view
util.return %9 : !hal.buffer_view
}
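
iree-stream-schedule-execution wraps each asynchronous op into its own stream.async.execute region that yields both the produced resource and a !stream.timepoint, and the surrounding code awaits that timepoint before the resource is used (here the awaits are forced immediately by the util.optimization_barrier ops). As a loose host-side analogy only, with invented names rather than IREE's runtime API, the pattern resembles submitting work that hands back a completion handle and waiting on it before reading the buffer:

# Loose host-side analogy for execute/await (invented names; not IREE APIs).
from concurrent.futures import ThreadPoolExecutor

def splat(value, count):
    # Stands in for stream.async.splat: fill a buffer with one value.
    return [value] * count

with ThreadPoolExecutor() as pool:
    # The future plays the role of both %results and %result_timepoint above.
    pending = pool.submit(splat, 0, 16)
    buffer = pending.result()  # loosely like stream.timepoint.await before using the resource
    assert buffer == [0] * 16
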
// -----// IR Dump After ScheduleConcurrencyPass (iree-stream-schedule-concurrency) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<external>{%c16} {
%10 = stream.async.splat %c0_i8 : i8 -> !stream.resource<external>{%c16}
stream.yield %10 : !stream.resource<external>{%c16}
} => !stream.timepoint
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c16}
%1 = util.optimization_barrier %0 : !stream.resource<external>
%results_0, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c8} {
%10 = stream.async.splat %c1_i32 : i32 -> !stream.resource<transient>{%c8}
stream.yield %10 : !stream.resource<transient>{%c8}
} => !stream.timepoint
%2 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c8}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %__constant_tensor_2x2xi32 : !stream.resource<constant>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<constant>
%7 = stream.resource.size %1 : !stream.resource<external>
%results_2, %result_timepoint_3 = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<constant>{%6}, %1 as %arg2: !stream.resource<external>{%7}) -> %1{%7} {
%10 = stream.async.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0[%c0 to %5 for %5], %arg1[%c0 to %6 for %6], %arg2[%c0 to %7 for %7]) : (!stream.resource<transient>{%5}, !stream.resource<constant>{%6}, !stream.resource<external>{%7}) -> %arg2{%7}
stream.yield %10 : !stream.resource<external>{%7}
} => !stream.timepoint
%8 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%7}
%9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2x2xi32> in !stream.resource<external>{%7} -> !hal.buffer_view
util.return %9 : !hal.buffer_view
}
// -----// IR Dump After PropagateTimepointsPass (iree-stream-propagate-timepoints) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private mutable @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c16 = arith.constant 16 : index
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<constant>{%c16} {
%cst = stream.async.constant : !stream.resource<constant>{%c16} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
stream.yield %cst : !stream.resource<constant>{%c16}
} => !stream.timepoint
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<constant>{%c16}
util.global.store %result_timepoint, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.global.store %results, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%__constant_tensor_2x2xi32__timepoint = util.global.load @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%0 = stream.resource.size %__constant_tensor_2x2xi32 : !stream.resource<constant>
%1 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%0}
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<external>{%c16} {
%16 = stream.async.splat %c0_i8 : i8 -> !stream.resource<external>{%c16}
stream.yield %16 : !stream.resource<external>{%c16}
} => !stream.timepoint
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c16}
%3 = util.optimization_barrier %2 : !stream.resource<external>
%results_0, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c8} {
%16 = stream.async.splat %c1_i32 : i32 -> !stream.resource<transient>{%c8}
stream.yield %16 : !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = util.optimization_barrier %1 : !stream.resource<constant>
%7 = stream.resource.size %5 : !stream.resource<transient>
%8 = stream.resource.size %6 : !stream.resource<constant>
%9 = stream.resource.size %3 : !stream.resource<external>
%10 = stream.timepoint.immediate => !stream.timepoint
%11 = stream.timepoint.immediate => !stream.timepoint
%12 = stream.timepoint.immediate => !stream.timepoint
%13 = stream.timepoint.join max(%10, %11, %12) => !stream.timepoint
%results_2, %result_timepoint_3 = stream.async.execute on(#hal.device.affinity<@__device_0>) await(%13) => with(%5 as %arg0: !stream.resource<transient>{%7}, %6 as %arg1: !stream.resource<constant>{%8}, %3 as %arg2: !stream.resource<external>{%9}) -> %3{%9} {
%16 = stream.async.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0[%c0 to %7 for %7], %arg1[%c0 to %8 for %8], %arg2[%c0 to %9 for %9]) : (!stream.resource<transient>{%7}, !stream.resource<constant>{%8}, !stream.resource<external>{%9}) -> %arg2{%9}
stream.yield %16 : !stream.resource<external>{%9}
} => !stream.timepoint
%14 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%9}
%15 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %14 : tensor<2x2xi32> in !stream.resource<external>{%9} -> !hal.buffer_view
util.return %15 : !hal.buffer_view
}
}
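
iree-stream-propagate-timepoints splits the constant global into the resource plus a mutable @__constant_tensor_2x2xi32__timepoint global: the initializer now stores the execute region's timepoint instead of awaiting it, and @scatter loads both globals and awaits the timepoint before the constant is consumed. A minimal Python analogy of that value-plus-readiness-flag pattern (an analogy only, not IREE code):

# Minimal analogy for a global guarded by a separately stored timepoint (illustrative only).
import threading

_constant_ready = threading.Event()  # plays the role of @__constant_tensor_2x2xi32__timepoint
_constant_value = None               # plays the role of @__constant_tensor_2x2xi32

def initializer():
    global _constant_value
    _constant_value = [[0, 0], [1, 1]]  # the dense<[[0, 0], [1, 1]]> constant from the dump
    _constant_ready.set()               # "store" the completed timepoint

def use_constant():
    _constant_ready.wait()  # like stream.timepoint.await before reading the global
    return _constant_value

initializer()
assert use_constant() == [[0, 0], [1, 1]]
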
// -----// IR Dump After MaterializeBuiltinsPass (iree-stream-materialize-builtins) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private mutable @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c16 = arith.constant 16 : index
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<constant>{%c16} {
%cst = stream.async.constant : !stream.resource<constant>{%c16} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
stream.yield %cst : !stream.resource<constant>{%c16}
} => !stream.timepoint
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<constant>{%c16}
util.global.store %result_timepoint, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.global.store %results, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%__constant_tensor_2x2xi32__timepoint = util.global.load @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%0 = stream.resource.size %__constant_tensor_2x2xi32 : !stream.resource<constant>
%1 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%0}
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<external>{%c16} {
%16 = stream.async.splat %c0_i8 : i8 -> !stream.resource<external>{%c16}
stream.yield %16 : !stream.resource<external>{%c16}
} => !stream.timepoint
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c16}
%3 = util.optimization_barrier %2 : !stream.resource<external>
%results_0, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c8} {
%16 = stream.async.splat %c1_i32 : i32 -> !stream.resource<transient>{%c8}
stream.yield %16 : !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = util.optimization_barrier %1 : !stream.resource<constant>
%7 = stream.resource.size %5 : !stream.resource<transient>
%8 = stream.resource.size %6 : !stream.resource<constant>
%9 = stream.resource.size %3 : !stream.resource<external>
%10 = stream.timepoint.immediate => !stream.timepoint
%11 = stream.timepoint.immediate => !stream.timepoint
%12 = stream.timepoint.immediate => !stream.timepoint
%13 = stream.timepoint.join max(%10, %11, %12) => !stream.timepoint
%results_2, %result_timepoint_3 = stream.async.execute on(#hal.device.affinity<@__device_0>) await(%13) => with(%5 as %arg0: !stream.resource<transient>{%7}, %6 as %arg1: !stream.resource<constant>{%8}, %3 as %arg2: !stream.resource<external>{%9}) -> %3{%9} {
%16 = stream.async.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0[%c0 to %7 for %7], %arg1[%c0 to %8 for %8], %arg2[%c0 to %9 for %9]) : (!stream.resource<transient>{%7}, !stream.resource<constant>{%8}, !stream.resource<external>{%9}) -> %arg2{%9}
stream.yield %16 : !stream.resource<external>{%9}
} => !stream.timepoint
%14 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%9}
%15 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %14 : tensor<2x2xi32> in !stream.resource<external>{%9} -> !hal.buffer_view
util.return %15 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.initializer {
%c16 = arith.constant 16 : index
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<constant>{%c16} {
%cst = stream.async.constant : !stream.resource<constant>{%c16} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
stream.yield %cst : !stream.resource<constant>{%c16}
} => !stream.timepoint
util.global.store %result_timepoint, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.global.store %results, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.return
}
// -----// IR Dump After CSE (cse) //----- //
util.initializer {
%c16 = arith.constant 16 : index
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<constant>{%c16} {
%cst = stream.async.constant : !stream.resource<constant>{%c16} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
stream.yield %cst : !stream.resource<constant>{%c16}
} => !stream.timepoint
util.global.store %result_timepoint, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.global.store %results, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.initializer {
%c16 = arith.constant 16 : index
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<constant>{%c16} {
%cst = stream.async.constant : !stream.resource<constant>{%c16} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
stream.yield %cst : !stream.resource<constant>{%c16}
} => !stream.timepoint
util.global.store %results, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %result_timepoint, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%__constant_tensor_2x2xi32__timepoint = util.global.load @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%0 = stream.resource.size %__constant_tensor_2x2xi32 : !stream.resource<constant>
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<external>{%c16} {
%12 = stream.async.splat %c0_i8 : i8 -> !stream.resource<external>{%c16}
stream.yield %12 : !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%results_0, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c8} {
%12 = stream.async.splat %c1_i32 : i32 -> !stream.resource<transient>{%c8}
stream.yield %12 : !stream.resource<transient>{%c8}
} => !stream.timepoint
%3 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c8}
%4 = util.optimization_barrier %3 : !stream.resource<transient>
%5 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%0}
%6 = util.optimization_barrier %5 : !stream.resource<constant>
%7 = stream.resource.size %4 : !stream.resource<transient>
%8 = stream.resource.size %6 : !stream.resource<constant>
%9 = stream.resource.size %2 : !stream.resource<external>
%results_2, %result_timepoint_3 = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%4 as %arg0: !stream.resource<transient>{%7}, %6 as %arg1: !stream.resource<constant>{%8}, %2 as %arg2: !stream.resource<external>{%9}) -> %2{%9} {
%12 = stream.async.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0[%c0 to %7 for %7], %arg1[%c0 to %8 for %8], %arg2[%c0 to %9 for %9]) : (!stream.resource<transient>{%7}, !stream.resource<constant>{%8}, !stream.resource<external>{%9}) -> %arg2{%9}
stream.yield %12 : !stream.resource<external>{%9}
} => !stream.timepoint
%10 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%9}
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x2xi32> in !stream.resource<external>{%9} -> !hal.buffer_view
util.return %11 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%__constant_tensor_2x2xi32__timepoint = util.global.load @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%0 = stream.resource.size %__constant_tensor_2x2xi32 : !stream.resource<constant>
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<external>{%c16} {
%12 = stream.async.splat %c0_i8 : i8 -> !stream.resource<external>{%c16}
stream.yield %12 : !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%results_0, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c8} {
%12 = stream.async.splat %c1_i32 : i32 -> !stream.resource<transient>{%c8}
stream.yield %12 : !stream.resource<transient>{%c8}
} => !stream.timepoint
%3 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c8}
%4 = util.optimization_barrier %3 : !stream.resource<transient>
%5 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%0}
%6 = util.optimization_barrier %5 : !stream.resource<constant>
%7 = stream.resource.size %4 : !stream.resource<transient>
%8 = stream.resource.size %6 : !stream.resource<constant>
%9 = stream.resource.size %2 : !stream.resource<external>
%results_2, %result_timepoint_3 = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%4 as %arg0: !stream.resource<transient>{%7}, %6 as %arg1: !stream.resource<constant>{%8}, %2 as %arg2: !stream.resource<external>{%9}) -> %2{%9} {
%12 = stream.async.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0[%c0 to %7 for %7], %arg1[%c0 to %8 for %8], %arg2[%c0 to %9 for %9]) : (!stream.resource<transient>{%7}, !stream.resource<constant>{%8}, !stream.resource<external>{%9}) -> %arg2{%9}
stream.yield %12 : !stream.resource<external>{%9}
} => !stream.timepoint
%10 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%9}
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x2xi32> in !stream.resource<external>{%9} -> !hal.buffer_view
util.return %11 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%__constant_tensor_2x2xi32__timepoint = util.global.load @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%0 = stream.resource.size %__constant_tensor_2x2xi32 : !stream.resource<constant>
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<external>{%c16} {
%12 = stream.async.splat %c0_i8 : i8 -> !stream.resource<external>{%c16}
stream.yield %12 : !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%results_0, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c8} {
%12 = stream.async.splat %c1_i32 : i32 -> !stream.resource<transient>{%c8}
stream.yield %12 : !stream.resource<transient>{%c8}
} => !stream.timepoint
%3 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c8}
%4 = util.optimization_barrier %3 : !stream.resource<transient>
%5 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%0}
%6 = util.optimization_barrier %5 : !stream.resource<constant>
%7 = stream.resource.size %4 : !stream.resource<transient>
%8 = stream.resource.size %6 : !stream.resource<constant>
%9 = stream.resource.size %2 : !stream.resource<external>
%results_2, %result_timepoint_3 = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%4 as %arg0: !stream.resource<transient>{%7}, %6 as %arg1: !stream.resource<constant>{%8}, %2 as %arg2: !stream.resource<external>{%9}) -> %2{%9} {
%12 = stream.async.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0[%c0 to %7 for %7], %arg1[%c0 to %8 for %8], %arg2[%c0 to %9 for %9]) : (!stream.resource<transient>{%7}, !stream.resource<constant>{%8}, !stream.resource<external>{%9}) -> %arg2{%9}
stream.yield %12 : !stream.resource<external>{%9}
} => !stream.timepoint
%10 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%9}
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x2xi32> in !stream.resource<external>{%9} -> !hal.buffer_view
util.return %11 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private mutable @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c16 = arith.constant 16 : index
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<constant>{%c16} {
%cst = stream.async.constant : !stream.resource<constant>{%c16} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
stream.yield %cst : !stream.resource<constant>{%c16}
} => !stream.timepoint
util.global.store %results, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %result_timepoint, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%0 = stream.resource.size %__constant_tensor_2x2xi32 : !stream.resource<constant>
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<external>{%c16} {
%12 = stream.async.splat %c0_i8 : i8 -> !stream.resource<external>{%c16}
stream.yield %12 : !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%results_0, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c8} {
%12 = stream.async.splat %c1_i32 : i32 -> !stream.resource<transient>{%c8}
stream.yield %12 : !stream.resource<transient>{%c8}
} => !stream.timepoint
%3 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c8}
%4 = util.optimization_barrier %3 : !stream.resource<transient>
%5 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%0}
%6 = util.optimization_barrier %5 : !stream.resource<constant>
%7 = stream.resource.size %4 : !stream.resource<transient>
%8 = stream.resource.size %6 : !stream.resource<constant>
%9 = stream.resource.size %2 : !stream.resource<external>
%results_2, %result_timepoint_3 = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%4 as %arg0: !stream.resource<transient>{%7}, %6 as %arg1: !stream.resource<constant>{%8}, %2 as %arg2: !stream.resource<external>{%9}) -> %2{%9} {
%12 = stream.async.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0[%c0 to %7 for %7], %arg1[%c0 to %8 for %8], %arg2[%c0 to %9 for %9]) : (!stream.resource<transient>{%7}, !stream.resource<constant>{%8}, !stream.resource<external>{%9}) -> %arg2{%9}
stream.yield %12 : !stream.resource<external>{%9}
} => !stream.timepoint
%10 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%9}
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x2xi32> in !stream.resource<external>{%9} -> !hal.buffer_view
util.return %11 : !hal.buffer_view
}
}
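// Note: only the tail of the dump preceding ApplyPatterns is visible above, and it matches the
// corresponding lines of this output exactly, so whatever ApplyPatterns changed is not in the portion
// shown here.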
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c16 = arith.constant 16 : index
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<constant>{%c16} {
%cst = stream.async.constant : !stream.resource<constant>{%c16} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
stream.yield %cst : !stream.resource<constant>{%c16}
} => !stream.timepoint
util.global.store %results, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %result_timepoint, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%0 = stream.resource.size %__constant_tensor_2x2xi32 : !stream.resource<constant>
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<external>{%c16} {
%12 = stream.async.splat %c0_i8 : i8 -> !stream.resource<external>{%c16}
stream.yield %12 : !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%results_0, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c8} {
%12 = stream.async.splat %c1_i32 : i32 -> !stream.resource<transient>{%c8}
stream.yield %12 : !stream.resource<transient>{%c8}
} => !stream.timepoint
%3 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c8}
%4 = util.optimization_barrier %3 : !stream.resource<transient>
%5 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%0}
%6 = util.optimization_barrier %5 : !stream.resource<constant>
%7 = stream.resource.size %4 : !stream.resource<transient>
%8 = stream.resource.size %6 : !stream.resource<constant>
%9 = stream.resource.size %2 : !stream.resource<external>
%results_2, %result_timepoint_3 = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%4 as %arg0: !stream.resource<transient>{%7}, %6 as %arg1: !stream.resource<constant>{%8}, %2 as %arg2: !stream.resource<external>{%9}) -> %2{%9} {
%12 = stream.async.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0[%c0 to %7 for %7], %arg1[%c0 to %8 for %8], %arg2[%c0 to %9 for %9]) : (!stream.resource<transient>{%7}, !stream.resource<constant>{%8}, !stream.resource<external>{%9}) -> %arg2{%9}
stream.yield %12 : !stream.resource<external>{%9}
} => !stream.timepoint
%10 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%9}
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x2xi32> in !stream.resource<external>{%9} -> !hal.buffer_view
util.return %11 : !hal.buffer_view
}
}
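// Note: relative to the ApplyPatterns dump above, FoldGlobals dropped the `mutable` keyword from
// @__constant_tensor_2x2xi32__timepoint (its only store is in the initializer) and the matching
// util.global.load in @scatter now carries `immutable`; the rest of the module is unchanged.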
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c16 = arith.constant 16 : index
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<constant>{%c16} {
%cst = stream.async.constant : !stream.resource<constant>{%c16} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
stream.yield %cst : !stream.resource<constant>{%c16}
} => !stream.timepoint
util.global.store %results, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %result_timepoint, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%0 = stream.resource.size %__constant_tensor_2x2xi32 : !stream.resource<constant>
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<external>{%c16} {
%12 = stream.async.splat %c0_i8 : i8 -> !stream.resource<external>{%c16}
stream.yield %12 : !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%results_0, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c8} {
%12 = stream.async.splat %c1_i32 : i32 -> !stream.resource<transient>{%c8}
stream.yield %12 : !stream.resource<transient>{%c8}
} => !stream.timepoint
%3 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c8}
%4 = util.optimization_barrier %3 : !stream.resource<transient>
%5 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%0}
%6 = util.optimization_barrier %5 : !stream.resource<constant>
%7 = stream.resource.size %4 : !stream.resource<transient>
%8 = stream.resource.size %6 : !stream.resource<constant>
%9 = stream.resource.size %2 : !stream.resource<external>
%results_2, %result_timepoint_3 = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%4 as %arg0: !stream.resource<transient>{%7}, %6 as %arg1: !stream.resource<constant>{%8}, %2 as %arg2: !stream.resource<external>{%9}) -> %2{%9} {
%12 = stream.async.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0[%c0 to %7 for %7], %arg1[%c0 to %8 for %8], %arg2[%c0 to %9 for %9]) : (!stream.resource<transient>{%7}, !stream.resource<constant>{%8}, !stream.resource<external>{%9}) -> %arg2{%9}
stream.yield %12 : !stream.resource<external>{%9}
} => !stream.timepoint
%10 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%9}
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x2xi32> in !stream.resource<external>{%9} -> !hal.buffer_view
util.return %11 : !hal.buffer_view
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c16 = arith.constant 16 : index
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<constant>{%c16} {
%cst = stream.async.constant : !stream.resource<constant>{%c16} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
stream.yield %cst : !stream.resource<constant>{%c16}
} => !stream.timepoint
util.global.store %results, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %result_timepoint, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%0 = stream.resource.size %__constant_tensor_2x2xi32 : !stream.resource<constant>
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<external>{%c16} {
%12 = stream.async.splat %c0_i8 : i8 -> !stream.resource<external>{%c16}
stream.yield %12 : !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%results_0, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c8} {
%12 = stream.async.splat %c1_i32 : i32 -> !stream.resource<transient>{%c8}
stream.yield %12 : !stream.resource<transient>{%c8}
} => !stream.timepoint
%3 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c8}
%4 = util.optimization_barrier %3 : !stream.resource<transient>
%5 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%0}
%6 = util.optimization_barrier %5 : !stream.resource<constant>
%7 = stream.resource.size %4 : !stream.resource<transient>
%8 = stream.resource.size %6 : !stream.resource<constant>
%9 = stream.resource.size %2 : !stream.resource<external>
%results_2, %result_timepoint_3 = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%4 as %arg0: !stream.resource<transient>{%7}, %6 as %arg1: !stream.resource<constant>{%8}, %2 as %arg2: !stream.resource<external>{%9}) -> %2{%9} {
%12 = stream.async.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0[%c0 to %7 for %7], %arg1[%c0 to %8 for %8], %arg2[%c0 to %9 for %9]) : (!stream.resource<transient>{%7}, !stream.resource<constant>{%8}, !stream.resource<external>{%9}) -> %arg2{%9}
stream.yield %12 : !stream.resource<external>{%9}
} => !stream.timepoint
%10 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%9}
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x2xi32> in !stream.resource<external>{%9} -> !hal.buffer_view
util.return %11 : !hal.buffer_view
}
}
// -----// IR Dump After VerifyLoweringToAsyncPass (iree-stream-verify-lowering-to-async) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c16 = arith.constant 16 : index
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<constant>{%c16} {
%cst = stream.async.constant : !stream.resource<constant>{%c16} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
stream.yield %cst : !stream.resource<constant>{%c16}
} => !stream.timepoint
util.global.store %results, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %result_timepoint, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%0 = stream.resource.size %__constant_tensor_2x2xi32 : !stream.resource<constant>
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<external>{%c16} {
%12 = stream.async.splat %c0_i8 : i8 -> !stream.resource<external>{%c16}
stream.yield %12 : !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%results_0, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c8} {
%12 = stream.async.splat %c1_i32 : i32 -> !stream.resource<transient>{%c8}
stream.yield %12 : !stream.resource<transient>{%c8}
} => !stream.timepoint
%3 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c8}
%4 = util.optimization_barrier %3 : !stream.resource<transient>
%5 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%0}
%6 = util.optimization_barrier %5 : !stream.resource<constant>
%7 = stream.resource.size %4 : !stream.resource<transient>
%8 = stream.resource.size %6 : !stream.resource<constant>
%9 = stream.resource.size %2 : !stream.resource<external>
%results_2, %result_timepoint_3 = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%4 as %arg0: !stream.resource<transient>{%7}, %6 as %arg1: !stream.resource<constant>{%8}, %2 as %arg2: !stream.resource<external>{%9}) -> %2{%9} {
%12 = stream.async.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0[%c0 to %7 for %7], %arg1[%c0 to %8 for %8], %arg2[%c0 to %9 for %9]) : (!stream.resource<transient>{%7}, !stream.resource<constant>{%8}, !stream.resource<external>{%9}) -> %arg2{%9}
stream.yield %12 : !stream.resource<external>{%9}
} => !stream.timepoint
%10 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%9}
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x2xi32> in !stream.resource<external>{%9} -> !hal.buffer_view
util.return %11 : !hal.buffer_view
}
}
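// Note: the FuseGlobals, IPO, and VerifyLoweringToAsyncPass dumps above are textually identical to the
// FoldGlobals output; those passes found nothing further to change here, and the verification pass
// presumably only checks that the module has been fully lowered to async stream ops.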
// -----// IR Dump After ScheduleAllocationPass (iree-stream-schedule-allocation) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c16 = arith.constant 16 : index
%results, %result_timepoint = stream.resource.constants on(#hal.device.affinity<@__device_0>) :
!stream.resource<constant>{%c16} = dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
=> !stream.timepoint
%0 = stream.cmd.execute once on(#hal.device.affinity<@__device_0>) with() {
} => !stream.timepoint
%1 = stream.timepoint.join max(%result_timepoint, %0) => !stream.timepoint
util.global.store %results, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %1, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%0 = stream.resource.size %__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%c0_0 = arith.constant 0 : index
%1 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0_0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %result : !stream.resource<external>{%c16}
%3 = util.optimization_barrier %2 : !stream.resource<external>
%result_1, %result_timepoint_2 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%c0_3 = arith.constant 0 : index
%4 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_2) => with(%result_1 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0_3 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%5 = stream.timepoint.await %4 => %result_1 : !stream.resource<transient>{%c8}
%6 = util.optimization_barrier %5 : !stream.resource<transient>
%7 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%0}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %6 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %3 : !stream.resource<external>
%c0_4 = arith.constant 0 : index
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%6 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %3 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %3 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
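// Note: ScheduleAllocationPass makes the first structural change since FoldGlobals. Each
// stream.async.execute region is lowered to an explicit allocation plus a stream.cmd.execute region:
// the two splats become stream.resource.alloca followed by stream.cmd.fill, the dispatch becomes
// stream.cmd.dispatch with ro/rw resource bindings, and the initializer's inline stream.async.constant
// is replaced by a stream.resource.constants upload whose timepoint is joined with an (empty)
// stream.cmd.execute.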
// -----// IR Dump After PackConstantsPass (iree-stream-pack-constants) //----- //
util.initializer {
%c16 = arith.constant 16 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
%c0_i64 = arith.constant 0 : i64
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) {
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant>
} else {
%5 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%6 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %5[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
scf.yield %6, %5 : !stream.timepoint, !stream.resource<constant>
}
%2 = stream.resource.subview %1#1[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%3 = stream.cmd.execute once on(#hal.device.affinity<@__device_0>) with() {
} => !stream.timepoint
%4 = stream.timepoint.join max(%1#0, %3) => !stream.timepoint
util.global.store %2, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %4, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
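// Note: PackConstantsPass packs the single tensor<2x2xi32> constant into one 64-byte aligned
// util.buffer.constant composite:
//   bytes [0, 16)  : dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
//   bytes [16, 64) : zero padding up to the 64-byte constant alignment
// At runtime the initializer first tries stream.resource.try_map to map that host buffer directly as a
// constant resource; if mapping fails, the scf.if fallback allocates the resource and fills it with
// stream.file.read. Either way, the 16-byte subview of the 64-byte storage is what gets stored into
// @__constant_tensor_2x2xi32.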
// -----// IR Dump After LayoutSlicesPass (iree-stream-layout-slices) //----- //
util.initializer {
%c16 = arith.constant 16 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
%c0_i64 = arith.constant 0 : i64
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) {
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant>
} else {
%5 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%6 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %5[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
scf.yield %6, %5 : !stream.timepoint, !stream.resource<constant>
}
%2 = stream.resource.subview %1#1[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%3 = stream.cmd.execute once on(#hal.device.affinity<@__device_0>) with() {
} => !stream.timepoint
%4 = stream.timepoint.join max(%1#0, %3) => !stream.timepoint
util.global.store %2, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %4, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
// -----// IR Dump After PackConstantsPass (iree-stream-pack-constants) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%0 = stream.resource.size %__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%c0_0 = arith.constant 0 : index
%1 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0_0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %result : !stream.resource<external>{%c16}
%3 = util.optimization_barrier %2 : !stream.resource<external>
%result_1, %result_timepoint_2 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%c0_3 = arith.constant 0 : index
%4 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_2) => with(%result_1 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0_3 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%5 = stream.timepoint.await %4 => %result_1 : !stream.resource<transient>{%c8}
%6 = util.optimization_barrier %5 : !stream.resource<transient>
%7 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%0}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %6 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %3 : !stream.resource<external>
%c0_4 = arith.constant 0 : index
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%6 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %3 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %3 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
// -----// IR Dump After LayoutSlicesPass (iree-stream-layout-slices) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%0 = stream.resource.size %__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%c0_0 = arith.constant 0 : index
%1 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0_0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %result : !stream.resource<external>{%c16}
%3 = util.optimization_barrier %2 : !stream.resource<external>
%result_1, %result_timepoint_2 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%c0_3 = arith.constant 0 : index
%4 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_2) => with(%result_1 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0_3 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%5 = stream.timepoint.await %4 => %result_1 : !stream.resource<transient>{%c8}
%6 = util.optimization_barrier %5 : !stream.resource<transient>
%7 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%0}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %6 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %3 : !stream.resource<external>
%c0_4 = arith.constant 0 : index
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%6 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %3 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %3 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
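// Note: the three dumps above (LayoutSlicesPass on the initializer, then PackConstantsPass and
// LayoutSlicesPass on @scatter) are identical to their predecessors; @scatter has no constants left to
// pack and, with each execution region touching its own resources, there is apparently nothing for the
// slice layout to rearrange.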
// -----// IR Dump After PropagateSubranges (iree-util-propagate-subranges) //----- //
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global private mutable @__constant_tensor_2x2xi32__storage_size : index
util.global private mutable @__constant_tensor_2x2xi32__offset : index
util.global private mutable @__constant_tensor_2x2xi32__length : index
util.initializer {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%c64 = arith.constant 64 : index
%c0_0 = arith.constant 0 : index
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0_0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
%c0_i64 = arith.constant 0 : i64
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) {
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant>
} else {
%5 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0_0 for %c64] : !util.buffer{%c64} -> !stream.file
%6 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %5[%c0_0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
scf.yield %6, %5 : !stream.timepoint, !stream.resource<constant>
}
%2 = stream.resource.subview %1#1[%c0_0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%3 = stream.cmd.execute once on(#hal.device.affinity<@__device_0>) with() {
} => !stream.timepoint
%4 = stream.timepoint.join max(%1#0, %3) => !stream.timepoint
util.global.store %1#1, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %c64, @__constant_tensor_2x2xi32__storage_size : index
util.global.store %c0_0, @__constant_tensor_2x2xi32__offset : index
util.global.store %c16, @__constant_tensor_2x2xi32__length : index
util.global.store %4, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%__constant_tensor_2x2xi32__storage_size = util.global.load @__constant_tensor_2x2xi32__storage_size : index
%__constant_tensor_2x2xi32__offset = util.global.load @__constant_tensor_2x2xi32__offset : index
%__constant_tensor_2x2xi32__length = util.global.load @__constant_tensor_2x2xi32__length : index
%0 = stream.resource.subview %__constant_tensor_2x2xi32[%__constant_tensor_2x2xi32__offset] : !stream.resource<constant>{%__constant_tensor_2x2xi32__storage_size} -> !stream.resource<constant>{%__constant_tensor_2x2xi32__length}
%1 = stream.resource.size %0 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%c0_0 = arith.constant 0 : index
%2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0_0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c16}
%4 = util.optimization_barrier %3 : !stream.resource<external>
%result_1, %result_timepoint_2 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%c0_3 = arith.constant 0 : index
%5 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_2) => with(%result_1 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0_3 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%6 = stream.timepoint.await %5 => %result_1 : !stream.resource<transient>{%c8}
%7 = util.optimization_barrier %6 : !stream.resource<transient>
%8 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %0 : !stream.resource<constant>{%1}
%9 = util.optimization_barrier %8 : !stream.resource<constant>
%10 = stream.resource.size %7 : !stream.resource<transient>
%11 = stream.resource.size %9 : !stream.resource<constant>
%12 = stream.resource.size %4 : !stream.resource<external>
%c0_4 = arith.constant 0 : index
%13 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%7 as %arg0: !stream.resource<transient>{%10}, %9 as %arg1: !stream.resource<constant>{%11}, %4 as %arg2: !stream.resource<external>{%12}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %10] : !stream.resource<transient>{%10},
ro %arg1[%c0 for %11] : !stream.resource<constant>{%11},
rw %arg2[%c0 for %12] : !stream.resource<external>{%12}
}
} => !stream.timepoint
%14 = stream.timepoint.await %13 => %4 : !stream.resource<external>{%12}
%15 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %14 : tensor<2x2xi32> in !stream.resource<external>{%12} -> !hal.buffer_view
util.return %15 : !hal.buffer_view
}
}
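// Note: PropagateSubranges expands the subviewed constant global into four globals (the
// !stream.resource<constant> plus __storage_size, __offset, and __length index globals). The initializer
// now stores the full 64-byte resource together with the subrange parameters, and @scatter reconstructs
// the 16-byte view with an explicit stream.resource.subview after loading them.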
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) {
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant>
} else {
%2 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%3 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %2[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
scf.yield %3, %2 : !stream.timepoint, !stream.resource<constant>
}
util.global.store %1#1, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %c64, @__constant_tensor_2x2xi32__storage_size : index
util.global.store %c0, @__constant_tensor_2x2xi32__offset : index
util.global.store %c16, @__constant_tensor_2x2xi32__length : index
util.global.store %1#0, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
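// Note: canonicalizing the initializer folds away the empty `stream.cmd.execute once`, the
// stream.timepoint.join, and the now-dead stream.resource.subview, leaving just the try_map/file.read
// selection and the five util.global.store ops.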
// -----// IR Dump After CSE (cse) //----- //
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) {
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant>
} else {
%2 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%3 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %2[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
scf.yield %3, %2 : !stream.timepoint, !stream.resource<constant>
}
util.global.store %1#1, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %c64, @__constant_tensor_2x2xi32__storage_size : index
util.global.store %c0, @__constant_tensor_2x2xi32__offset : index
util.global.store %c16, @__constant_tensor_2x2xi32__length : index
util.global.store %1#0, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) {
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant>
} else {
%2 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%3 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %2[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
scf.yield %3, %2 : !stream.timepoint, !stream.resource<constant>
}
util.global.store %1#1, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %c16, @__constant_tensor_2x2xi32__length : index
util.global.store %c0, @__constant_tensor_2x2xi32__offset : index
util.global.store %c64, @__constant_tensor_2x2xi32__storage_size : index
util.global.store %1#0, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
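// Note: CSE leaves the canonicalized initializer untouched (the dump is unchanged), and
// SimplifyGlobalAccesses only reorders the three index stores (now length, offset, storage_size); the
// stored values are the same.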
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%__constant_tensor_2x2xi32__storage_size = util.global.load @__constant_tensor_2x2xi32__storage_size : index
%__constant_tensor_2x2xi32__offset = util.global.load @__constant_tensor_2x2xi32__offset : index
%__constant_tensor_2x2xi32__length = util.global.load @__constant_tensor_2x2xi32__length : index
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%__constant_tensor_2x2xi32__storage_size}
%7 = stream.resource.subview %6[%__constant_tensor_2x2xi32__offset] : !stream.resource<constant>{%__constant_tensor_2x2xi32__storage_size} -> !stream.resource<constant>{%__constant_tensor_2x2xi32__length}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
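// Note: in the canonicalized @scatter above, the 16-byte external resource filled with i8 0 corresponds
// to the dense<0> : tensor<2x2xi32> output operand, and the 8-byte transient resource filled with i32 1
// corresponds to the dense<1> : tensor<2xi32> updates; the util.optimization_barrier ops keep these
// unfoldable constants live as opaque inputs to the dispatch.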
// -----// IR Dump After CSE (cse) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%__constant_tensor_2x2xi32__storage_size = util.global.load @__constant_tensor_2x2xi32__storage_size : index
%__constant_tensor_2x2xi32__offset = util.global.load @__constant_tensor_2x2xi32__offset : index
%__constant_tensor_2x2xi32__length = util.global.load @__constant_tensor_2x2xi32__length : index
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%__constant_tensor_2x2xi32__storage_size}
%7 = stream.resource.subview %6[%__constant_tensor_2x2xi32__offset] : !stream.resource<constant>{%__constant_tensor_2x2xi32__storage_size} -> !stream.resource<constant>{%__constant_tensor_2x2xi32__length}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%__constant_tensor_2x2xi32__storage_size = util.global.load @__constant_tensor_2x2xi32__storage_size : index
%__constant_tensor_2x2xi32__offset = util.global.load @__constant_tensor_2x2xi32__offset : index
%__constant_tensor_2x2xi32__length = util.global.load @__constant_tensor_2x2xi32__length : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%__constant_tensor_2x2xi32__storage_size}
%7 = stream.resource.subview %6[%__constant_tensor_2x2xi32__offset] : !stream.resource<constant>{%__constant_tensor_2x2xi32__storage_size} -> !stream.resource<constant>{%__constant_tensor_2x2xi32__length}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
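// Note: relative to the CSE dump above, iree-util-simplify-global-accesses has only hoisted the
// util.global.load ops to the top of @scatter; the rest of the function body is unchanged.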
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global private mutable @__constant_tensor_2x2xi32__storage_size = 64 : index
util.global private mutable @__constant_tensor_2x2xi32__offset = 0 : index
util.global private mutable @__constant_tensor_2x2xi32__length = 16 : index
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) {
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant>
} else {
%2 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%3 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %2[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
scf.yield %3, %2 : !stream.timepoint, !stream.resource<constant>
}
util.global.store %1#1, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %1#0, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%__constant_tensor_2x2xi32__storage_size = util.global.load @__constant_tensor_2x2xi32__storage_size : index
%__constant_tensor_2x2xi32__offset = util.global.load @__constant_tensor_2x2xi32__offset : index
%__constant_tensor_2x2xi32__length = util.global.load @__constant_tensor_2x2xi32__length : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%__constant_tensor_2x2xi32__storage_size}
%7 = stream.resource.subview %6[%__constant_tensor_2x2xi32__offset] : !stream.resource<constant>{%__constant_tensor_2x2xi32__storage_size} -> !stream.resource<constant>{%__constant_tensor_2x2xi32__length}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
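// Note: the dispatch body above implements iree_linalg_ext.scatter with dimension_map = [0, 1] and
// unique_indices(true), where the region yields the update value. The NumPy sketch below is an
// illustration of that behavior for this 2x2 case (it assumes each index row addresses one output
// element that is overwritten by the corresponding update), not the code the dispatch actually runs.

import numpy as np

updates = np.full((2,), 1, dtype=np.int32)        # tensor<2xi32> of ones (the 8-byte transient fill)
indices = np.array([[0, 0], [1, 1]], np.int32)    # tensor<2x2xi32> index rows (the constant resource)
output  = np.zeros((2, 2), dtype=np.int32)        # readwrite tensor<2x2xi32> (the zero-filled external)

for i in range(updates.shape[0]):
    # the region `iree_linalg_ext.yield %arg3` keeps the update value, i.e. an overwrite
    output[tuple(indices[i])] = updates[i]

print(output)  # expected: [[1 0]
               #            [0 1]]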
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) {
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant>
} else {
%2 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%3 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %2[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
scf.yield %3, %2 : !stream.timepoint, !stream.resource<constant>
}
util.global.store %1#1, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %1#0, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%c64 = arith.constant 64 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
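// Note: compared with the ApplyPatterns dump above, iree-util-fold-globals has deleted the
// @__constant_tensor_2x2xi32__storage_size/__offset/__length globals (whose values had already become
// constant) and inlined them into @scatter as %c64, %c0, and %c16.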
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) {
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant>
} else {
%2 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%3 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %2[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
scf.yield %3, %2 : !stream.timepoint, !stream.resource<constant>
}
util.global.store %1#1, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %1#0, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%c64 = arith.constant 64 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) {
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant>
} else {
%2 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%3 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %2[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
scf.yield %3, %2 : !stream.timepoint, !stream.resource<constant>
}
util.global.store %1#1, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %1#0, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%c64 = arith.constant 64 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
// -----// IR Dump After VerifyLoweringToCmdPass (iree-stream-verify-lowering-to-cmd) //----- //
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) {
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant>
} else {
%2 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%3 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %2[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
scf.yield %3, %2 : !stream.timepoint, !stream.resource<constant>
}
util.global.store %1#1, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %1#0, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%c64 = arith.constant 64 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
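// Note: iree-stream-verify-lowering-to-cmd appears to be a verification-only step here; the module is
// unchanged from the preceding IPO dump.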
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) {
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant>
} else {
%2 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%3 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %2[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
scf.yield %3, %2 : !stream.timepoint, !stream.resource<constant>
}
util.global.store %1#1, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %1#0, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
// -----// IR Dump After CSE (cse) //----- //
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) {
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant>
} else {
%2 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%3 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %2[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
scf.yield %3, %2 : !stream.timepoint, !stream.resource<constant>
}
util.global.store %1#1, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %1#0, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) {
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant>
} else {
%2 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%3 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %2[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
scf.yield %3, %2 : !stream.timepoint, !stream.resource<constant>
}
util.global.store %1#1, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %1#0, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%c64 = arith.constant 64 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%c64 = arith.constant 64 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%c64 = arith.constant 64 : index
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
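// Note: this second Canonicalizer/CSE/SimplifyGlobalAccesses round makes no further changes to @scatter
// beyond hoisting the two util.global.load ops back to the top of the function.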
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) {
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant>
} else {
%2 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%3 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %2[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
scf.yield %3, %2 : !stream.timepoint, !stream.resource<constant>
}
util.global.store %1#1, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %1#0, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c64 = arith.constant 64 : index
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
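// Note: the #composite_of_64b attribute introduced in this dump packs the 16 bytes
// of dense<[[0, 0], [1, 1]]> : tensor<2x2xi32> index data together with a 48-byte
// zero tail so the constant pool becomes a single 64-byte blob (16 + 48 = 64),
// matching the {alignment = 64 : index} on util.buffer.constant in the
// initializer. The @scatter body then carves the live 16 bytes back out of that
// pool:
//   %7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}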
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) {
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant>
} else {
%2 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%3 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %2[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
scf.yield %3, %2 : !stream.timepoint, !stream.resource<constant>
}
util.global.store %1#1, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %1#0, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c64 = arith.constant 64 : index
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) {
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant>
} else {
%2 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%3 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %2[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
scf.yield %3, %2 : !stream.timepoint, !stream.resource<constant>
}
util.global.store %1#1, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %1#0, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c64 = arith.constant 64 : index
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) {
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant>
} else {
%2 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%3 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %2[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
scf.yield %3, %2 : !stream.timepoint, !stream.resource<constant>
}
util.global.store %1#1, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %1#0, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c64 = arith.constant 64 : index
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
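// Note: the FoldGlobals, FuseGlobals, and IPO dumps above are unchanged from the
// preceding ApplyPatterns output; with a single device global, one constant
// global/timepoint pair, and no internal callees in this small module, those
// passes appear to have nothing left to fold, fuse, or inline.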
// -----// IR Dump After SCFToControlFlow (convert-scf-to-cf) //----- //
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb1, ^bb2
^bb1: // pred: ^bb0
cf.br ^bb3(%0, %result : !stream.timepoint, !stream.resource<constant>)
^bb2: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb3(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb3(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb1, ^bb2
cf.br ^bb4
^bb4: // pred: ^bb3
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
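// Note: convert-scf-to-cf lowers the initializer's structured scf.if into the
// explicit cf.cond_br / cf.br blocks shown above. Below is a rough standalone
// sketch of the same lowering on a hypothetical toy function (not part of this
// module); the real dump additionally carries the empty forwarding block ^bb4,
// which the next canonicalize pass removes.
func.func @toy_select(%cond: i1, %a: i32, %b: i32) -> i32 {
  %0 = scf.if %cond -> (i32) {
    scf.yield %a : i32
  } else {
    scf.yield %b : i32
  }
  return %0 : i32
}
// becomes, roughly:
func.func @toy_select_cf(%cond: i1, %a: i32, %b: i32) -> i32 {
  cf.cond_br %cond, ^bb1, ^bb2
^bb1:  // pred: ^bb0
  cf.br ^bb3(%a : i32)
^bb2:  // pred: ^bb0
  cf.br ^bb3(%b : i32)
^bb3(%0: i32):  // 2 preds: ^bb1, ^bb2
  return %0 : i32
}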
// -----// IR Dump After SCFToControlFlow (convert-scf-to-cf) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c64 = arith.constant 64 : index
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
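// Note: canonicalize collapsed the straight-line blocks left by convert-scf-to-cf:
// ^bb1 (which only forwarded to the merge block) and the empty ^bb4 are gone, so
// cf.cond_br now feeds ^bb2(...) directly on the mapped path while the fallback
// alloc/file-read path branches to the same block arguments. The CSE and
// iree-util-simplify-global-accesses dumps of the initializer that follow are
// unchanged from this form.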
// -----// IR Dump After CSE (cse) //----- //
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c64 = arith.constant 64 : index
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c64 = arith.constant 64 : index
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%c64 = arith.constant 64 : index
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
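// Note: compared with the Canonicalizer and CSE dumps of @scatter just above
// (which left the function as-is), iree-util-simplify-global-accesses appears to
// have hoisted the two immutable util.global.load ops ahead of the arith.constant
// block. A rough sketch of that motion on a hypothetical global @g (assuming no
// interfering store between the load's old and new position):
util.global private @g = 7 : i32
util.func private @hoist_example(%arg0: i32) -> i32 {
  %sum = arith.addi %arg0, %arg0 : i32
  %g = util.global.load immutable @g : i32   // the pass moves this load to the top of the entry block
  %out = arith.addi %sum, %g : i32
  util.return %out : i32
}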
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%c64 = arith.constant 64 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
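// Note: this second ApplyPatterns dump comes from what looks like a fixed-point
// optimization loop (note the new iree.fixedpoint.iteration = 0 : index module
// attribute). Relative to the dumps just above, the only visible change in
// @scatter is the ordering of the hoisted arith.constant and util.global.load
// ops; the initializer keeps its canonicalized cf form and the dispatch
// executable carries over unchanged.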
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%c64 = arith.constant 64 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
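Note for readers skimming the dump: the next few dumps (FuseGlobals, IPO, ElideTimepointsPass) print this same module essentially unchanged, so the interesting content is the scatter dispatch itself. The following NumPy sketch is an illustration of the semantics visible in the IR above, not IREE code: updates dense<1> : tensor<2xi32> are written into a zero-filled 2x2 output at the positions given by indices dense<[[0, 0], [1, 1]]>, with dimension_map = [0, 1] mapping the index columns to output dims 0 and 1 and the region simply yielding the update value.

import numpy as np

init = np.zeros((2, 2), dtype=np.int32)          # outs(%0 : tensor<2x2xi32>)
updates = np.ones(2, dtype=np.int32)             # ins %1 : tensor<2xi32>
indices = np.array([[0, 0], [1, 1]], np.int32)   # ins %2 : tensor<2x2xi32>

result = init.copy()
for i in range(updates.shape[0]):
    result[tuple(indices[i])] = updates[i]       # overwrite, no accumulation

print(result)   # [[1 0]
                #  [0 1]]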
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%c64 = arith.constant 64 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%c64 = arith.constant 64 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
// -----// IR Dump After ElideTimepointsPass (iree-stream-elide-timepoints) //----- //
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%c64 = arith.constant 64 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
// -----// IR Dump After FixedPointIterator (iree-util-fixed-point-iterator) //----- //
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%c64 = arith.constant 64 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
// -----// IR Dump After FuseDispatchBindingsPass (iree-stream-fuse-dispatch-bindings) //----- //
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%arg3] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%arg4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%arg5] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg6: i32, %arg7: i32):
iree_linalg_ext.yield %arg6 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%c64 = arith.constant 64 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%c0_2 = arith.constant 0 : index
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%c0, %c0, %c0 : index, index, index) {
ro %arg0[%c0_2 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0_2 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0_2 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
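In the FuseDispatchBindingsPass dump above, the dispatch entry point gains explicit per-binding offset operands (%arg3, %arg4, %arg5 : index) and the call site passes %c0 for each, so the kernel subspans each binding at a caller-provided byte offset rather than a baked-in one. A rough Python sketch of that idea follows; the helper names and the (buffer, offset) framing are illustrative assumptions, not IREE's implementation.

def subspan(binding: bytearray, offset: int, length: int) -> memoryview:
    # Analogue of stream.binding.subspan %argN[%offset]: a view at a runtime offset.
    return memoryview(binding)[offset:offset + length]

binding = bytearray(64)                  # e.g. a 64-byte resource backing the binding
view = subspan(binding, 0, 16)           # the dispatch here passes offset %c0
view[0:4] = (7).to_bytes(4, "little")    # writes land in the underlying binding
print(binding[0:4].hex())                # 07000000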
// -----// IR Dump After AnnotateDispatchArgumentsPass (iree-stream-annotate-dispatch-arguments) //----- //
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: index {stream.values = [0 : index]}, %arg4: index {stream.values = [0 : index]}, %arg5: index {stream.values = [0 : index]}) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%arg3] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%1 = stream.binding.subspan %arg1[%arg4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%2 = stream.binding.subspan %arg2[%arg5] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%3, %4 : tensor<2xi32>, tensor<2x2xi32>) outs(%5 : tensor<2x2xi32>) {
^bb0(%arg6: i32, %arg7: i32):
iree_linalg_ext.yield %arg6 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%c64 = arith.constant 64 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%c0_2 = arith.constant 0 : index
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%c0, %c0, %c0 : index, index, index) {
ro %arg0[%c0_2 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0_2 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0_2 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
// -----// IR Dump After AnnotateDispatchAssumptionsPass (iree-stream-annotate-dispatch-assumptions) //----- //
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: index {stream.values = [0 : index]}, %arg4: index {stream.values = [0 : index]}, %arg5: index {stream.values = [0 : index]}) {
%0:3 = util.assume.int
%arg3<umin = 0, umax = 0>,
%arg4<umin = 0, umax = 0>,
%arg5<umin = 0, umax = 0>
: index, index, index
%c0 = arith.constant 0 : index
%1 = stream.binding.subspan %arg0[%0#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%2 = stream.binding.subspan %arg1[%0#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%3 = stream.binding.subspan %arg2[%0#2] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%4, %5 : tensor<2xi32>, tensor<2x2xi32>) outs(%6 : tensor<2x2xi32>) {
^bb0(%arg6: i32, %arg7: i32):
iree_linalg_ext.yield %arg6 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%c64 = arith.constant 64 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%c0_2 = arith.constant 0 : index
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%c0, %c0, %c0 : index, index, index) {
ro %arg0[%c0_2 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0_2 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0_2 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
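The two annotation dumps above attach facts to the dispatch arguments: stream.alignment = 64 on each binding, stream.values = [0 : index] on each offset operand, and util.assume.int bounds of umin = 0, umax = 0, i.e. each offset is asserted to be exactly zero. A tiny sketch of how such a collapsed range lets a later pass treat an operand as a constant (assumed semantics, for illustration only):

def try_fold(umin: int, umax: int):
    # A [umin, umax] range that collapses to a single value is a constant.
    return umin if umin == umax else None

for name in ("%arg3", "%arg4", "%arg5"):
    print(name, "->", try_fold(0, 0))   # each folds to 0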
// -----// IR Dump After PackDispatchOperandsPass (iree-stream-pack-dispatch-operands) //----- //
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {
%0 = arith.extui %arg3 : i32 to i64
%1 = arith.extui %arg4 : i32 to i64
%c32_i64 = arith.constant 32 : i64
%2 = arith.shli %1, %c32_i64 : i64
%3 = arith.ori %0, %2 : i64
%4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index
%5 = arith.extui %arg5 : i32 to i64
%6 = arith.extui %arg6 : i32 to i64
%c32_i64_0 = arith.constant 32 : i64
%7 = arith.shli %6, %c32_i64_0 : i64
%8 = arith.ori %5, %7 : i64
%9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index
%10 = arith.extui %arg7 : i32 to i64
%11 = arith.extui %arg8 : i32 to i64
%c32_i64_1 = arith.constant 32 : i64
%12 = arith.shli %11, %c32_i64_1 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index
%15:3 = util.assume.int
%4<umin = 0, umax = 0>,
%9<umin = 0, umax = 0>,
%14<umin = 0, umax = 0>
: index, index, index
%c0 = arith.constant 0 : index
%16 = stream.binding.subspan %arg0[%15#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%17 = stream.binding.subspan %arg1[%15#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%18 = stream.binding.subspan %arg2[%15#2] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%19 = flow.dispatch.tensor.load %16, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%20 = flow.dispatch.tensor.load %17, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%21 = flow.dispatch.tensor.load %18, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%22 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%19, %20 : tensor<2xi32>, tensor<2x2xi32>) outs(%21 : tensor<2x2xi32>) {
^bb0(%arg9: i32, %arg10: i32):
iree_linalg_ext.yield %arg9 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %22, %18, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%c64 = arith.constant 64 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%c0_2 = arith.constant 0 : index
%c0_i64 = arith.constant 0 : i64
%c0_i32 = arith.constant 0 : i32
%c32_i64 = arith.constant 32 : i64
%c0_i64_3 = arith.constant 0 : i64
%c0_i32_4 = arith.constant 0 : i32
%c0_i64_5 = arith.constant 0 : i64
%c0_i32_6 = arith.constant 0 : i32
%c32_i64_7 = arith.constant 32 : i64
%c0_i64_8 = arith.constant 0 : i64
%c0_i32_9 = arith.constant 0 : i32
%c0_i64_10 = arith.constant 0 : i64
%c0_i32_11 = arith.constant 0 : i32
%c32_i64_12 = arith.constant 32 : i64
%c0_i64_13 = arith.constant 0 : i64
%c0_i32_14 = arith.constant 0 : i32
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%c0_i32, %c0_i32_4, %c0_i32_6, %c0_i32_9, %c0_i32_11, %c0_i32_14 : i32, i32, i32, i32, i32, i32) {
ro %arg0[%c0_2 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0_2 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0_2 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
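The dispatch being compiled here implements the scatter from the original program: updates = [1, 1] (the transient buffer filled with %c1_i32), indices = [[0, 0], [1, 1]] (the 64-byte constant composite), and an output initialized to zero (the external buffer filled with %c0_i8). A minimal NumPy sketch of that semantics follows, assuming the usual iree_linalg_ext.scatter behavior where dimension_map = [0, 1] maps index columns to output dimensions and the region simply yields the update value; this sketch is illustrative and not part of the compiler output.

import numpy as np

updates = np.array([1, 1], dtype=np.int32)            # tensor<2xi32>, filled with 1
indices = np.array([[0, 0], [1, 1]], dtype=np.int32)  # tensor<2x2xi32> constant
out = np.zeros((2, 2), dtype=np.int32)                # readwrite output, zero-filled

for i in range(updates.shape[0]):
    # dimension_map = [0, 1]: column j of `indices` addresses output dim j;
    # unique_indices(true) plus a yield-only region means each write overwrites.
    out[tuple(indices[i])] = updates[i]

print(out)  # expected result: [[1, 0], [0, 1]]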
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
// -----// IR Dump After CSE (cse) //----- //
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c0_i32 = arith.constant 0 : i32
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%c64 = arith.constant 64 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32) {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c0_i32 = arith.constant 0 : i32
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%c64 = arith.constant 64 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32) {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%c0_i32 = arith.constant 0 : i32
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%c64 = arith.constant 64 : index
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32) {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {
%c32_i64 = arith.constant 32 : i64
%0 = arith.extui %arg3 : i32 to i64
%1 = arith.extui %arg4 : i32 to i64
%2 = arith.shli %1, %c32_i64 : i64
%3 = arith.ori %0, %2 : i64
%4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index
%5 = arith.extui %arg5 : i32 to i64
%6 = arith.extui %arg6 : i32 to i64
%7 = arith.shli %6, %c32_i64 : i64
%8 = arith.ori %5, %7 : i64
%9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index
%10 = arith.extui %arg7 : i32 to i64
%11 = arith.extui %arg8 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index
%15:3 = util.assume.int
%4<umin = 0, umax = 0>,
%9<umin = 0, umax = 0>,
%14<umin = 0, umax = 0>
: index, index, index
%16 = stream.binding.subspan %arg0[%15#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%17 = stream.binding.subspan %arg1[%15#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%18 = stream.binding.subspan %arg2[%15#2] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%19 = flow.dispatch.tensor.load %16, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%20 = flow.dispatch.tensor.load %17, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%21 = flow.dispatch.tensor.load %18, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%22 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%19, %20 : tensor<2xi32>, tensor<2x2xi32>) outs(%21 : tensor<2x2xi32>) {
^bb0(%arg9: i32, %arg10: i32):
iree_linalg_ext.yield %arg9 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %22, %18, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c64 = arith.constant 64 : index
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%c0_i32 = arith.constant 0 : i32
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32) {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {
%c32_i64 = arith.constant 32 : i64
%0 = arith.extui %arg3 : i32 to i64
%1 = arith.extui %arg4 : i32 to i64
%2 = arith.shli %1, %c32_i64 : i64
%3 = arith.ori %0, %2 : i64
%4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index
%5 = arith.extui %arg5 : i32 to i64
%6 = arith.extui %arg6 : i32 to i64
%7 = arith.shli %6, %c32_i64 : i64
%8 = arith.ori %5, %7 : i64
%9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index
%10 = arith.extui %arg7 : i32 to i64
%11 = arith.extui %arg8 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index
%15:3 = util.assume.int
%4<umin = 0, umax = 0>,
%9<umin = 0, umax = 0>,
%14<umin = 0, umax = 0>
: index, index, index
%16 = stream.binding.subspan %arg0[%15#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%17 = stream.binding.subspan %arg1[%15#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%18 = stream.binding.subspan %arg2[%15#2] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%19 = flow.dispatch.tensor.load %16, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%20 = flow.dispatch.tensor.load %17, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%21 = flow.dispatch.tensor.load %18, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%22 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%19, %20 : tensor<2xi32>, tensor<2x2xi32>) outs(%21 : tensor<2x2xi32>) {
^bb0(%arg9: i32, %arg10: i32):
iree_linalg_ext.yield %arg9 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %22, %18, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c64 = arith.constant 64 : index
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%c0_i32 = arith.constant 0 : i32
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32) {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {
%c32_i64 = arith.constant 32 : i64
%0 = arith.extui %arg3 : i32 to i64
%1 = arith.extui %arg4 : i32 to i64
%2 = arith.shli %1, %c32_i64 : i64
%3 = arith.ori %0, %2 : i64
%4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index
%5 = arith.extui %arg5 : i32 to i64
%6 = arith.extui %arg6 : i32 to i64
%7 = arith.shli %6, %c32_i64 : i64
%8 = arith.ori %5, %7 : i64
%9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index
%10 = arith.extui %arg7 : i32 to i64
%11 = arith.extui %arg8 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index
%15:3 = util.assume.int
%4<umin = 0, umax = 0>,
%9<umin = 0, umax = 0>,
%14<umin = 0, umax = 0>
: index, index, index
%16 = stream.binding.subspan %arg0[%15#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%17 = stream.binding.subspan %arg1[%15#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%18 = stream.binding.subspan %arg2[%15#2] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%19 = flow.dispatch.tensor.load %16, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%20 = flow.dispatch.tensor.load %17, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%21 = flow.dispatch.tensor.load %18, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%22 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%19, %20 : tensor<2xi32>, tensor<2x2xi32>) outs(%21 : tensor<2x2xi32>) {
^bb0(%arg9: i32, %arg10: i32):
iree_linalg_ext.yield %arg9 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %22, %18, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c64 = arith.constant 64 : index
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%c0_i32 = arith.constant 0 : i32
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32) {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {
%c32_i64 = arith.constant 32 : i64
%0 = arith.extui %arg3 : i32 to i64
%1 = arith.extui %arg4 : i32 to i64
%2 = arith.shli %1, %c32_i64 : i64
%3 = arith.ori %0, %2 : i64
%4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index
%5 = arith.extui %arg5 : i32 to i64
%6 = arith.extui %arg6 : i32 to i64
%7 = arith.shli %6, %c32_i64 : i64
%8 = arith.ori %5, %7 : i64
%9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index
%10 = arith.extui %arg7 : i32 to i64
%11 = arith.extui %arg8 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index
%15:3 = util.assume.int
%4<umin = 0, umax = 0>,
%9<umin = 0, umax = 0>,
%14<umin = 0, umax = 0>
: index, index, index
%16 = stream.binding.subspan %arg0[%15#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%17 = stream.binding.subspan %arg1[%15#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%18 = stream.binding.subspan %arg2[%15#2] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%19 = flow.dispatch.tensor.load %16, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%20 = flow.dispatch.tensor.load %17, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%21 = flow.dispatch.tensor.load %18, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%22 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%19, %20 : tensor<2xi32>, tensor<2x2xi32>) outs(%21 : tensor<2x2xi32>) {
^bb0(%arg9: i32, %arg10: i32):
iree_linalg_ext.yield %arg9 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %22, %18, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c64 = arith.constant 64 : index
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%c0_i32 = arith.constant 0 : i32
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32) {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
// -----// IR Dump After FoldUniformOperandsPass (iree-stream-fold-uniform-operands) //----- //
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0_i32 = arith.constant 0 : i32
%c32_i64 = arith.constant 32 : i64
%0 = arith.extui %c0_i32 : i32 to i64
%1 = arith.extui %c0_i32 : i32 to i64
%2 = arith.shli %1, %c32_i64 : i64
%3 = arith.ori %0, %2 : i64
%4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index
%5 = arith.extui %c0_i32 : i32 to i64
%6 = arith.extui %c0_i32 : i32 to i64
%7 = arith.shli %6, %c32_i64 : i64
%8 = arith.ori %5, %7 : i64
%9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index
%10 = arith.extui %c0_i32 : i32 to i64
%11 = arith.extui %c0_i32 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index
%15:3 = util.assume.int
%4<umin = 0, umax = 0>,
%9<umin = 0, umax = 0>,
%14<umin = 0, umax = 0>
: index, index, index
%16 = stream.binding.subspan %arg0[%15#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%17 = stream.binding.subspan %arg1[%15#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%18 = stream.binding.subspan %arg2[%15#2] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%19 = flow.dispatch.tensor.load %16, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%20 = flow.dispatch.tensor.load %17, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%21 = flow.dispatch.tensor.load %18, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%22 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%19, %20 : tensor<2xi32>, tensor<2x2xi32>) outs(%21 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %22, %18, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c64 = arith.constant 64 : index
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%c0_i32 = arith.constant 0 : i32
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
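After iree-stream-fold-uniform-operands, the six i32 push constants (all uniformly zero at the single dispatch site) are folded into the executable: the entry point above now takes only the three bindings and materializes %c0_i32 locally, and the stream.cmd.dispatch at the end of this dump passes no operands. The arith.extui / arith.shli / arith.ori / arith.index_castui sequence that consumed those values just reassembles a 64-bit binding offset from a low/high pair of 32-bit words; a minimal Python sketch (illustrative only, not compiler output):

def unpack_offset(lo32: int, hi32: int) -> int:
    # lo32/hi32 correspond to the two i32 push constants per binding:
    # extui widens to i64, shli/ori recombine them, index_castui yields the offset.
    return (lo32 & 0xFFFFFFFF) | ((hi32 & 0xFFFFFFFF) << 32)

# All three binding offsets in this example are zero, matching the
# util.assume.int <umin = 0, umax = 0> annotations in the dispatch body.
assert unpack_offset(0, 0) == 0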
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
// -----// IR Dump After CSE (cse) //----- //
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
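// note: the two util.global.store ops already sit at the end of ^bb2, so this pass leaves the
// initializer unchanged.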
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
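// note: canonicalized form of the public @scatter entry point; each filled buffer is awaited and
// wrapped in util.optimization_barrier so later passes cannot optimize across the barrier before
// the scatter dispatch consumes it.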
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c64 = arith.constant 64 : index
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
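// note: identical to the preceding canonicalized dump; CSE has nothing to eliminate in @scatter.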
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c64 = arith.constant 64 : index
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
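// note: the two immutable util.global.load ops are hoisted to the top of @scatter, ahead of the
// arith.constant ops; the rest of the function is unchanged.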
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%c64 = arith.constant 64 : index
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
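// note: first whole-module dump in this stretch; at module scope the printer emits attribute
// aliases, so the 64-byte composite constant now appears as #composite_of_64b and the target
// configuration as #executable_target_embedded_elf_x86_64_ / #device_target_local.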
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = stream.binding.subspan %arg0[%0#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%2 = stream.binding.subspan %arg1[%0#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%3 = stream.binding.subspan %arg2[%0#2] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%4, %5 : tensor<2xi32>, tensor<2x2xi32>) outs(%6 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%c64 = arith.constant 64 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
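// note: no globals are folded; the module below matches the ApplyPatterns output.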
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = stream.binding.subspan %arg0[%0#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%2 = stream.binding.subspan %arg1[%0#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%3 = stream.binding.subspan %arg2[%0#2] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%4, %5 : tensor<2xi32>, tensor<2x2xi32>) outs(%6 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%c64 = arith.constant 64 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
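// note: no globals are fused; the module below is unchanged from the previous dump.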
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = stream.binding.subspan %arg0[%0#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%2 = stream.binding.subspan %arg1[%0#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%3 = stream.binding.subspan %arg2[%0#2] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%4, %5 : tensor<2xi32>, tensor<2x2xi32>) outs(%6 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%c64 = arith.constant 64 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
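// note: @scatter is the only entry point and there are no internal calls, so interprocedural
// optimization leaves the module unchanged.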
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = stream.binding.subspan %arg0[%0#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%2 = stream.binding.subspan %arg1[%0#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%3 = stream.binding.subspan %arg2[%0#2] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%4, %5 : tensor<2xi32>, tensor<2x2xi32>) outs(%6 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%c64 = arith.constant 64 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
// -----// IR Dump After SymbolDCE (symbol-dce) //----- //
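// note: every symbol (@__device_0, @scatter_dispatch_0, the constant globals, and @scatter) is
// still referenced, so symbol DCE removes nothing.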
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = stream.binding.subspan %arg0[%0#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%2 = stream.binding.subspan %arg1[%0#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%3 = stream.binding.subspan %arg2[%0#2] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%4, %5 : tensor<2xi32>, tensor<2x2xi32>) outs(%6 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%c64 = arith.constant 64 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
// -----// IR Dump After AssignLegacyTargetDevicesPass (iree-hal-assign-legacy-target-devices) //----- //
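// note: the module already binds @__device_0 to #device_target_local, so the legacy target-device
// assignment appears to be a no-op here; the IR is unchanged.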
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = stream.binding.subspan %arg0[%0#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%2 = stream.binding.subspan %arg1[%0#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%3 = stream.binding.subspan %arg2[%0#2] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%4, %5 : tensor<2xi32>, tensor<2x2xi32>) outs(%6 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%c64 = arith.constant 64 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
// -----// IR Dump After MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- //
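// note: the target device is already materialized as the @__device_0 global, so this pass also
// appears to leave the module unchanged.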
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = stream.binding.subspan %arg0[%0#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%2 = stream.binding.subspan %arg1[%0#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%3 = stream.binding.subspan %arg2[%0#2] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%4, %5 : tensor<2xi32>, tensor<2x2xi32>) outs(%6 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%c64 = arith.constant 64 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
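The dispatch in the dump above carries the whole computation: iree_linalg_ext.scatter writes each element of the 2xi32 updates tensor into the 2x2 output at the row/column picked by the corresponding row of the 2x2xi32 indices tensor (dimension_map = [0, 1], unique indices). A small NumPy sketch of that semantics, under the assumption that the loop below is an accurate model of the op for this static shape:

  import numpy as np

  def scatter_2x2(updates, indices, original):
      # Model of the iree_linalg_ext.scatter region above: the yielded value
      # (the update) overwrites the element addressed by each index row.
      out = original.copy()
      for i in range(indices.shape[0]):
          row, col = indices[i]            # dimension_map = [0, 1]
          out[row, col] = updates[i]
      return out

  updates  = np.ones(2, dtype=np.int32)                  # transient buffer filled with 1 : i32
  indices  = np.array([[0, 0], [1, 1]], dtype=np.int32)  # payload of #composite_of_64b
  original = np.zeros((2, 2), dtype=np.int32)            # external buffer zero-filled with 0 : i8
  print(scatter_2x2(updates, indices, original))         # expected: [[1 0]
                                                         #            [0 1]]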
// -----// IR Dump After ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- //
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = stream.binding.subspan %arg0[%0#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%2 = stream.binding.subspan %arg1[%0#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%3 = stream.binding.subspan %arg2[%0#2] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%4, %5 : tensor<2xi32>, tensor<2x2xi32>) outs(%6 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%c64 = arith.constant 64 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
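The #composite_of_64b attribute that keeps reappearing at the top of these dumps is just the 2x2xi32 index tensor packed into a 64-byte, 64-byte-aligned constant pool: 16 bytes of payload followed by 48 zero bytes of padding. A quick NumPy check of that layout (a sketch; the byte order assumes the little-endian x86_64 target named in #executable_target_embedded_elf_x86_64_):

  import numpy as np

  indices = np.array([[0, 0], [1, 1]], dtype=np.int32)   # dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>
  payload = indices.tobytes()                            # 16 bytes
  padding = b"\x00" * 48                                 # dense<0> : vector<48xi8>
  composite = payload + padding
  assert len(composite) == 64                            # #util.composite<64xi8, ...>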
// -----// IR Dump After ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- //
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = stream.binding.subspan %arg0[%0#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%2 = stream.binding.subspan %arg1[%0#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%3 = stream.binding.subspan %arg2[%0#2] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%4, %5 : tensor<2xi32>, tensor<2x2xi32>) outs(%6 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%c64 = arith.constant 64 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
// -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- //
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = stream.binding.subspan %arg0[%0#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%2 = stream.binding.subspan %arg1[%0#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%3 = stream.binding.subspan %arg2[%0#2] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%4, %5 : tensor<2xi32>, tensor<2x2xi32>) outs(%6 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%c64 = arith.constant 64 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
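The util.initializer in these dumps encodes a two-way constant upload strategy: first try to map the host-side constant buffer directly into a constant resource (stream.resource.try_map); only if that fails does it allocate device memory, wrap the host buffer as a stream.file, and stage the bytes in with stream.file.read. The Python-style pseudocode below restates that control flow; every helper on the device object is a hypothetical stand-in for the stream op named in the comment.

  # Hedged pseudocode for the util.initializer's control flow; the `device.*`
  # helpers are hypothetical stand-ins for the stream ops named on the right.
  def initialize_constant_pool(device, host_buffer, size=64):
      ok, resource = device.try_map(host_buffer, offset=0, size=size)   # stream.resource.try_map
      if ok:                                                            # cf.cond_br %did_map, ^bb2, ^bb1
          return device.immediate_timepoint(), resource                 # ^bb2 with the mapped resource
      resource = device.alloc_uninitialized(size)                       # stream.resource.alloc
      file = device.wrap_constant(host_buffer, offset=0, length=size)   # stream.file.constant
      timepoint = device.read(file, 0, resource, 0, size)               # stream.file.read => timepoint
      return timepoint, resource                                        # ^bb2 with the staged copy

Either path ends in ^bb2, which stores the resource and its readiness timepoint into the @__constant_tensor_2x2xi32 and @__constant_tensor_2x2xi32__timepoint globals.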
// -----// IR Dump After CSE (cse) //----- //
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = stream.binding.subspan %arg0[%0#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%2 = stream.binding.subspan %arg1[%0#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%3 = stream.binding.subspan %arg2[%0#2] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%4, %5 : tensor<2xi32>, tensor<2x2xi32>) outs(%6 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%c64 = arith.constant 64 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = stream.binding.subspan %arg0[%0#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%2 = stream.binding.subspan %arg1[%0#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%3 = stream.binding.subspan %arg2[%0#2] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%4, %5 : tensor<2xi32>, tensor<2x2xi32>) outs(%6 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%c64 = arith.constant 64 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
// -----// IR Dump After CSE (cse) //----- //
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = stream.binding.subspan %arg0[%0#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%2 = stream.binding.subspan %arg1[%0#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%3 = stream.binding.subspan %arg2[%0#2] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%4, %5 : tensor<2xi32>, tensor<2x2xi32>) outs(%6 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%c64 = arith.constant 64 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c0_i8 = arith.constant 0 : i8
%c1_i32 = arith.constant 1 : i32
%c64 = arith.constant 64 : index
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
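All of the byte sizes threaded through @scatter in this dump follow directly from the i32 element type: the 2x2 result needs 16 bytes, the 1-D updates tensor needs 8, and the constant pool is padded out to the 64-byte alignment noted above. A quick arithmetic check:

  elem = 4                       # sizeof(i32)
  c16 = 2 * 2 * elem             # %c16: external result buffer for tensor<2x2xi32>
  c8  = 2 * elem                 # %c8:  transient updates buffer for tensor<2xi32>
  c64 = 64                       # %c64: constant pool size after 64-byte alignment padding
  assert (c16, c8, c64) == (16, 8, 64)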
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = stream.binding.subspan %arg0[%0#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%2 = stream.binding.subspan %arg1[%0#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%3 = stream.binding.subspan %arg2[%0#2] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%4, %5 : tensor<2xi32>, tensor<2x2xi32>) outs(%6 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c64 = arith.constant 64 : index
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
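// Since the dumps before and after this point carry the same program, it may help to spell out once what the scatter dispatch actually computes. The following is a minimal sketch in Python (not part of the dump; numpy is assumed), using only values visible in the IR: the zero-filled 2x2 destination, the 8-byte buffer filled with i32 ones as updates, and the indices [[0, 0], [1, 1]] stored in #composite_of_64b.
# Minimal sketch, assuming numpy: each update value is written at the row/column
# pair given by the corresponding index row; the scatter region yields %arg0,
# so the update value overwrites the destination element.
import numpy as np

destination = np.zeros((2, 2), dtype=np.int32)        # the 16-byte, zero-filled output
updates = np.array([1, 1], dtype=np.int32)            # the 8-byte, one-filled transient
indices = np.array([[0, 0], [1, 1]], dtype=np.int32)  # first 16 bytes of #composite_of_64b

for update, index in zip(updates, indices):
    destination[tuple(index)] = update                # dimension_map = [0, 1]
print(destination)                                    # [[1 0]
                                                      #  [0 1]]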
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = stream.binding.subspan %arg0[%0#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%2 = stream.binding.subspan %arg1[%0#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%3 = stream.binding.subspan %arg2[%0#2] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%4, %5 : tensor<2xi32>, tensor<2x2xi32>) outs(%6 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c64 = arith.constant 64 : index
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
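// The util.initializer in these dumps first tries to map the 64-byte host constant directly into a device constant resource and only falls back to staging it through a file read when the mapping fails. A hedged host-side sketch of that control flow follows; every API name in it is invented for illustration and is not an IREE runtime call.
# Sketch only: mirrors ^bb0 / ^bb1 / ^bb2 of the initializer above.
def initialize_constant(device, host_bytes):
    mapped = device.try_map_constant(host_bytes)      # stream.resource.try_map
    if mapped is not None:                            # %did_map -> branch to ^bb2
        return mapped, device.immediate_timepoint()   # pairs with the immediate timepoint %0
    staging = device.alloc_constant(len(host_bytes))  # stream.resource.alloc (^bb1)
    ready = device.read_into(host_bytes, staging)     # stream.file.constant + stream.file.read
    return staging, ready                             # both paths join at ^bb2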
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = stream.binding.subspan %arg0[%0#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%2 = stream.binding.subspan %arg1[%0#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%3 = stream.binding.subspan %arg2[%0#2] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%4, %5 : tensor<2xi32>, tensor<2x2xi32>) outs(%6 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c64 = arith.constant 64 : index
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
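// A quick way to see where the 64 in #composite_of_64b and %c64 comes from: the 2x2xi32 index tensor occupies 16 bytes and is padded with 48 zero bytes up to the 64-byte alignment requested on the buffer constant. A small sketch (numpy assumed):
# Sketch of the constant-pool layout behind #composite_of_64b.
import numpy as np

payload = np.array([[0, 0], [1, 1]], dtype=np.int32).tobytes()  # 16 bytes of indices
alignment = 64                                                   # {alignment = 64 : index}
padding = (-len(payload)) % alignment                            # 48 -> dense<0> : vector<48xi8>
pool = payload + b"\x00" * padding
assert len(pool) == 64                                           # matches %c64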
// -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- //
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
stream.executable private @scatter_dispatch_0 {
stream.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = stream.binding.subspan %arg0[%0#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2xi32>>
%2 = stream.binding.subspan %arg1[%0#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%3 = stream.binding.subspan %arg2[%0#2] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%4, %5 : tensor<2xi32>, tensor<2x2xi32>) outs(%6 : tensor<2x2xi32>) {
^bb0(%arg3: i32, %arg4: i32):
iree_linalg_ext.yield %arg3 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c64 = arith.constant 64 : index
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
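// For reference, the two stream.cmd.execute regions in @scatter build the test inputs by filling freshly allocated resources rather than loading constants, and the util.optimization_barrier ops keep those fills from being folded away. Byte-wise, the fills look roughly like the sketch below (little-endian i32 packing assumed).
# Sketch of the two fills above: a 16-byte zero fill (the 2x2 destination) and an
# 8-byte fill with the i32 pattern 1 (the two update values).
output_bytes = b"\x00" * 16                    # stream.cmd.fill %c0_i8 ... for %c16
update_bytes = (1).to_bytes(4, "little") * 2   # stream.cmd.fill %c1_i32 ... for %c8
assert len(output_bytes) == 16 and len(update_bytes) == 8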
// -----// IR Dump After MaterializeInterfacesPass (iree-hal-materialize-interfaces) //----- //
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
hal.executable private @scatter_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
hal.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store() {
%c0 = arith.constant 0 : index
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%0#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2xi32>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%0#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%0#2) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%4, %5 : tensor<2xi32>, tensor<2x2xi32>) outs(%6 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c64 = arith.constant 64 : index
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@embedded_elf_x86_64::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
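// After interface materialization the dispatch is described by #pipeline_layout: three storage-buffer bindings, the first two read-only and the third read-write, matching the ro/ro/rw operands of stream.cmd.dispatch. A hedged sketch of that logical binding table follows; the field names are invented for illustration.
# Sketch only: the binding table implied by #pipeline_layout and the dispatch above.
from dataclasses import dataclass

@dataclass
class Binding:
    ordinal: int
    access: str        # "ro" or "rw", as in stream.cmd.dispatch
    contents: str
    size_bytes: int

bindings = [
    Binding(0, "ro", "updates, tensor<2xi32> (transient)", 8),
    Binding(1, "ro", "indices, tensor<2x2xi32> (constant subview)", 16),
    Binding(2, "rw", "destination, tensor<2x2xi32> (external output)", 16),
]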
// -----// IR Dump After PruneExecutablesPass (iree-hal-prune-executables) //----- //
#composite_of_64b = #util.composite<64xi8, [
dense<[[0, 0], [1, 1]]> : tensor<2x2xi32>,
dense<0> : vector<48xi8>,
]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
hal.executable private @scatter_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
hal.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store() {
%c0 = arith.constant 0 : index
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%0#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2xi32>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%0#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%0#2) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%4, %5 : tensor<2xi32>, tensor<2x2xi32>) outs(%6 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
}
util.global private @__constant_tensor_2x2xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.initializer {
%c0_i64 = arith.constant 0 : i64
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%0 = stream.timepoint.immediate => !stream.timepoint
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_64b
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64}
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1
^bb1: // pred: ^bb0
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c64}
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c64] : !util.buffer{%c64} -> !stream.file
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c64 : !stream.file -> !stream.resource<constant>{%c64} => !stream.timepoint
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>)
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1
util.global.store %4, @__constant_tensor_2x2xi32 : !stream.resource<constant>
util.global.store %3, @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
util.return
}
util.func public @scatter() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @scatter() -> (%output0: tensor<2x2xi32>)"}} {
%c64 = arith.constant 64 : index
%c1_i32 = arith.constant 1 : i32
%c0_i8 = arith.constant 0 : i8
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%__constant_tensor_2x2xi32__timepoint = util.global.load immutable @__constant_tensor_2x2xi32__timepoint : !stream.timepoint
%__constant_tensor_2x2xi32 = util.global.load immutable @__constant_tensor_2x2xi32 : !stream.resource<constant>
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c16} => !stream.timepoint
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<external>{%c16}) {
stream.cmd.fill %c0_i8, %arg0[%c0 for %c16] : i8 -> !stream.resource<external>{%c16}
} => !stream.timepoint
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c16}
%2 = util.optimization_barrier %1 : !stream.resource<external>
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8} => !stream.timepoint
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c8}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c8] : i32 -> !stream.resource<transient>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c8}
%5 = util.optimization_barrier %4 : !stream.resource<transient>
%6 = stream.timepoint.await %__constant_tensor_2x2xi32__timepoint => %__constant_tensor_2x2xi32 : !stream.resource<constant>{%c64}
%7 = stream.resource.subview %6[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c16}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.resource.size %5 : !stream.resource<transient>
%10 = stream.resource.size %8 : !stream.resource<constant>
%11 = stream.resource.size %2 : !stream.resource<external>
%12 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) with(%5 as %arg0: !stream.resource<transient>{%9}, %8 as %arg1: !stream.resource<constant>{%10}, %2 as %arg2: !stream.resource<external>{%11}) {
stream.cmd.dispatch @scatter_dispatch_0::@embedded_elf_x86_64::@scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store {
ro %arg0[%c0 for %9] : !stream.resource<transient>{%9},
ro %arg1[%c0 for %10] : !stream.resource<constant>{%10},
rw %arg2[%c0 for %11] : !stream.resource<external>{%11}
}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %2 : !stream.resource<external>{%11}
%14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x2xi32> in !stream.resource<external>{%11} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
// -----// IR Dump After TypePropagationPass (iree-codegen-type-propagation) //----- //
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store() {
%c0 = arith.constant 0 : index
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%0#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2xi32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%0#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%0#2) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%4, %5 : tensor<2xi32>, tensor<2x2xi32>) outs(%6 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
// -----// IR Dump After BubbleUpOrdinalOpsPass (iree-codegen-bubble-up-ordinal-ops) //----- //
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store() {
%c0 = arith.constant 0 : index
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%0#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2xi32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%0#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%0#2) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%4, %5 : tensor<2xi32>, tensor<2x2xi32>) outs(%6 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
// -----// IR Dump After BufferizeCopyOnlyDispatchesPass (iree-codegen-bufferize-copy-only-dispatches) //----- //
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store() {
%c0 = arith.constant 0 : index
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%0#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2xi32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%0#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%0#2) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%4, %5 : tensor<2xi32>, tensor<2x2xi32>) outs(%6 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
// -----// IR Dump After DecomposeSoftmaxPass (iree-codegen-decompose-softmax) //----- //
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store() {
%c0 = arith.constant 0 : index
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%0#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2xi32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%0#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%0#2) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%4, %5 : tensor<2xi32>, tensor<2x2xi32>) outs(%6 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
// -----// IR Dump After MaterializeUserConfigsPass (iree-codegen-materialize-user-configs) //----- //
module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store() {
%c0 = arith.constant 0 : index
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%0#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2xi32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%0#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%0#2) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%4, %5 : tensor<2xi32>, tensor<2x2xi32>) outs(%6 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
// -----// IR Dump After RematerializeParallelOpsPass (iree-codegen-rematerialize-parallel-ops) //----- //
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store() {
%c0 = arith.constant 0 : index
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%0#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2xi32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%0#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%0#2) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%4, %5 : tensor<2xi32>, tensor<2x2xi32>) outs(%6 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
// -----// IR Dump After ExpandF16OpToF32Pass (iree-llvmcpu-expand-f16-op-to-f32) //----- //
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store() {
%c0 = arith.constant 0 : index
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%0#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2xi32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%0#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%0#2) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%4, %5 : tensor<2xi32>, tensor<2x2xi32>) outs(%6 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
// -----// IR Dump After CPUMaterializeDeviceEncodingPass (iree-codegen-cpu-materialize-device-encoding) //----- //
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store() {
%c0 = arith.constant 0 : index
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%0#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2xi32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%0#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%0#2) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%4, %5 : tensor<2xi32>, tensor<2x2xi32>) outs(%6 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
// -----// IR Dump After EraseHALDescriptorTypeFromMemRefPass (iree-codegen-erase-hal-descriptor-type-from-memref) //----- //
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store() {
%c0 = arith.constant 0 : index
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%0#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2xi32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%0#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%0#2) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%4, %5 : tensor<2xi32>, tensor<2x2xi32>) outs(%6 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
// -----// IR Dump After LLVMCPUSelectLoweringStrategyPass (iree-llvmcpu-select-lowering-strategy) //----- //
module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<CPUDefault>} {
%c0 = arith.constant 0 : index
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%0#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2xi32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%0#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%0#2) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = iree_linalg_ext.scatter {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1]]>} dimension_map = [0, 1] unique_indices(true) ins(%4, %5 : tensor<2xi32>, tensor<2x2xi32>) outs(%6 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
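// The selected strategy is translation_info = CPUDefault with lowering_config tile_sizes = [[1]],
// i.e. the scatter's batch dimension (the number of updates) is tiled one update at a time, as the
// next dump confirms. A small sketch of that tiling intent (illustrative helper, not an IREE API):

def tile_batch(num_updates, tile_size=1):
    # tile_sizes = [[1]]: split the batch dimension into (offset, size) tiles of
    # one update each; the distribution pass below assigns one tile per workgroup.
    return [(i, min(tile_size, num_updates - i))
            for i in range(0, num_updates, tile_size)]

assert tile_batch(2) == [(0, 1), (1, 1)]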
// -----// IR Dump After ConfigureTargetExecutableVariantsPass (iree-hal-configure-target-executable-variants) //----- //
hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) {
hal.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store ordinal(0) layout(#hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<CPUDefault>} {
%c0 = arith.constant 0 : index
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%0#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2xi32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%0#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%0#2) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = iree_linalg_ext.scatter {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1]]>} dimension_map = [0, 1] unique_indices(true) ins(%4, %5 : tensor<2xi32>, tensor<2x2xi32>) outs(%6 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
// -----// IR Dump After ConfigureExecutablesPass (iree-hal-configure-executables) //----- //
hal.executable private @scatter_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) {
hal.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store ordinal(0) layout(#hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<CPUDefault>} {
%c0 = arith.constant 0 : index
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%0#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2xi32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%0#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%0#2) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = iree_linalg_ext.scatter {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1]]>} dimension_map = [0, 1] unique_indices(true) ins(%4, %5 : tensor<2xi32>, tensor<2x2xi32>) outs(%6 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
}
}
// -----// IR Dump After LowerExecutableUsingTransformDialectPass (iree-codegen-lower-executable-using-transform-dialect) //----- //
module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<CPUDefault>} {
%c0 = arith.constant 0 : index
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%0#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2xi32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%0#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%0#2) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = iree_linalg_ext.scatter {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1]]>} dimension_map = [0, 1] unique_indices(true) ins(%4, %5 : tensor<2xi32>, tensor<2x2xi32>) outs(%6 : tensor<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
} -> tensor<2x2xi32>
flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
}
// -----// IR Dump After TileAndDistributeToWorkgroupsUsingForallOpPass (iree-codegen-tile-and-distribute-to-workgroups-using-forall-op) //----- //
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<CPUDefault>} {
%c0 = arith.constant 0 : index
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%0#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2xi32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%0#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%0#2) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = scf.forall (%arg0) in (2) shared_outs(%arg1 = %6) -> (tensor<2x2xi32>) {
%extracted_slice = tensor.extract_slice %4[%arg0] [1] [1] : tensor<2xi32> to tensor<1xi32>
%extracted_slice_0 = tensor.extract_slice %5[%arg0, 0] [1, 2] [1, 1] : tensor<2x2xi32> to tensor<1x2xi32>
%8 = iree_linalg_ext.scatter {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1]]>} dimension_map = [0, 1] unique_indices(true) ins(%extracted_slice, %extracted_slice_0 : tensor<1xi32>, tensor<1x2xi32>) outs(%arg1 : tensor<2x2xi32>) {
^bb0(%arg2: i32, %arg3: i32):
iree_linalg_ext.yield %arg2 : i32
} -> tensor<2x2xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg1[0, 0] [2, 2] [1, 1] : tensor<2x2xi32> into tensor<2x2xi32>
}
} {mapping = [#iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
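// The scf.forall above distributes the tiled scatter over two workgroups along
// workgroup_mapping<x>: workgroup x takes the 1-element slice of the updates and the matching row
// of the indices and scatters it into the shared 2x2 output. A rough NumPy model, with
// illustrative names:

import numpy as np

def distributed_scatter(updates, indices, original, num_workgroups=2):
    out = original.copy()
    for wg_x in range(num_workgroups):        # scf.forall (%arg0) in (2)
        upd = updates[wg_x:wg_x + 1]          # tensor.extract_slice %4[%arg0] [1] [1]
        idx = indices[wg_x:wg_x + 1, :]       # tensor.extract_slice %5[%arg0, 0] [1, 2] [1, 1]
        out[idx[0, 0], idx[0, 1]] = upd[0]    # one-update iree_linalg_ext.scatter
    return out                                # tensor.parallel_insert_slice into %arg1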
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<CPUDefault>} {
%c0 = arith.constant 0 : index
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%0#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2xi32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%0#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%0#2) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = scf.forall (%arg0) in (2) shared_outs(%arg1 = %6) -> (tensor<2x2xi32>) {
%extracted_slice = tensor.extract_slice %4[%arg0] [1] [1] : tensor<2xi32> to tensor<1xi32>
%extracted_slice_0 = tensor.extract_slice %5[%arg0, 0] [1, 2] [1, 1] : tensor<2x2xi32> to tensor<1x2xi32>
%8 = iree_linalg_ext.scatter {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1]]>} dimension_map = [0, 1] unique_indices(true) ins(%extracted_slice, %extracted_slice_0 : tensor<1xi32>, tensor<1x2xi32>) outs(%arg1 : tensor<2x2xi32>) {
^bb0(%arg2: i32, %arg3: i32):
iree_linalg_ext.yield %arg2 : i32
} -> tensor<2x2xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg1[0, 0] [2, 2] [1, 1] : tensor<2x2xi32> into tensor<2x2xi32>
}
} {mapping = [#iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<CPUDefault>} {
%c0 = arith.constant 0 : index
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%0#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2xi32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%0#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%0#2) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = scf.forall (%arg0) in (2) shared_outs(%arg1 = %6) -> (tensor<2x2xi32>) {
%extracted_slice = tensor.extract_slice %4[%arg0] [1] [1] : tensor<2xi32> to tensor<1xi32>
%extracted_slice_0 = tensor.extract_slice %5[%arg0, 0] [1, 2] [1, 1] : tensor<2x2xi32> to tensor<1x2xi32>
%8 = iree_linalg_ext.scatter {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1]]>} dimension_map = [0, 1] unique_indices(true) ins(%extracted_slice, %extracted_slice_0 : tensor<1xi32>, tensor<1x2xi32>) outs(%arg1 : tensor<2x2xi32>) {
^bb0(%arg2: i32, %arg3: i32):
iree_linalg_ext.yield %arg2 : i32
} -> tensor<2x2xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg1[0, 0] [2, 2] [1, 1] : tensor<2x2xi32> into tensor<2x2xi32>
}
} {mapping = [#iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
// -----// IR Dump After FuseTensorPadWithConsumerPass (iree-codegen-fuse-tensor-pad-with-consumer) //----- //
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<CPUDefault>} {
%c0 = arith.constant 0 : index
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%0#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2xi32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%0#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%0#2) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = scf.forall (%arg0) in (2) shared_outs(%arg1 = %6) -> (tensor<2x2xi32>) {
%extracted_slice = tensor.extract_slice %4[%arg0] [1] [1] : tensor<2xi32> to tensor<1xi32>
%extracted_slice_0 = tensor.extract_slice %5[%arg0, 0] [1, 2] [1, 1] : tensor<2x2xi32> to tensor<1x2xi32>
%8 = iree_linalg_ext.scatter {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1]]>} dimension_map = [0, 1] unique_indices(true) ins(%extracted_slice, %extracted_slice_0 : tensor<1xi32>, tensor<1x2xi32>) outs(%arg1 : tensor<2x2xi32>) {
^bb0(%arg2: i32, %arg3: i32):
iree_linalg_ext.yield %arg2 : i32
} -> tensor<2x2xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg1[0, 0] [2, 2] [1, 1] : tensor<2x2xi32> into tensor<2x2xi32>
}
} {mapping = [#iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
// -----// IR Dump After ConcretizePadResultShapePass (iree-codegen-concretize-pad-result-shape) //----- //
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<CPUDefault>} {
%c0 = arith.constant 0 : index
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%0#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2xi32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%0#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%0#2) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = scf.forall (%arg0) in (2) shared_outs(%arg1 = %6) -> (tensor<2x2xi32>) {
%extracted_slice = tensor.extract_slice %4[%arg0] [1] [1] : tensor<2xi32> to tensor<1xi32>
%extracted_slice_0 = tensor.extract_slice %5[%arg0, 0] [1, 2] [1, 1] : tensor<2x2xi32> to tensor<1x2xi32>
%8 = iree_linalg_ext.scatter {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1]]>} dimension_map = [0, 1] unique_indices(true) ins(%extracted_slice, %extracted_slice_0 : tensor<1xi32>, tensor<1x2xi32>) outs(%arg1 : tensor<2x2xi32>) {
^bb0(%arg2: i32, %arg3: i32):
iree_linalg_ext.yield %arg2 : i32
} -> tensor<2x2xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg1[0, 0] [2, 2] [1, 1] : tensor<2x2xi32> into tensor<2x2xi32>
}
} {mapping = [#iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
// -----// IR Dump After EliminateEmptyTensorsPass (iree-eliminate-empty-tensors) //----- //
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<CPUDefault>} {
%c0 = arith.constant 0 : index
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%0#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2xi32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%0#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%0#2) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = scf.forall (%arg0) in (2) shared_outs(%arg1 = %6) -> (tensor<2x2xi32>) {
%extracted_slice = tensor.extract_slice %4[%arg0] [1] [1] : tensor<2xi32> to tensor<1xi32>
%extracted_slice_0 = tensor.extract_slice %5[%arg0, 0] [1, 2] [1, 1] : tensor<2x2xi32> to tensor<1x2xi32>
%8 = iree_linalg_ext.scatter {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1]]>} dimension_map = [0, 1] unique_indices(true) ins(%extracted_slice, %extracted_slice_0 : tensor<1xi32>, tensor<1x2xi32>) outs(%arg1 : tensor<2x2xi32>) {
^bb0(%arg2: i32, %arg3: i32):
iree_linalg_ext.yield %arg2 : i32
} -> tensor<2x2xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg1[0, 0] [2, 2] [1, 1] : tensor<2x2xi32> into tensor<2x2xi32>
}
} {mapping = [#iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
// -----// IR Dump After EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- //
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<CPUDefault>} {
%c0 = arith.constant 0 : index
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%0#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2xi32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%0#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2xi32>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%0#2) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xi32>> -> tensor<2xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2xi32>> -> tensor<2x2xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2xi32>> -> tensor<2x2xi32>
%7 = scf.forall (%arg0) in (2) shared_outs(%arg1 = %6) -> (tensor<2x2xi32>) {
%extracted_slice = tensor.extract_slice %4[%arg0] [1] [1] : tensor<2xi32> to tensor<1xi32>
%extracted_slice_0 = tensor.extract_slice %5[%arg0, 0] [1, 2] [1, 1] : tensor<2x2xi32> to tensor<1x2xi32>
%8 = iree_linalg_ext.scatter {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1]]>} dimension_map = [0, 1] unique_indices(true) ins(%extracted_slice, %extracted_slice_0 : tensor<1xi32>, tensor<1x2xi32>) outs(%arg1 : tensor<2x2xi32>) {
^bb0(%arg2: i32, %arg3: i32):
iree_linalg_ext.yield %arg2 : i32
} -> tensor<2x2xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg1[0, 0] [2, 2] [1, 1] : tensor<2x2xi32> into tensor<2x2xi32>
}
} {mapping = [#iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [2, 2], strides = [1, 1] : tensor<2x2xi32> -> !flow.dispatch.tensor<readwrite:tensor<2x2xi32>>
return
}
// -----// IR Dump After IREEComprehensiveBufferizePass (iree-codegen-iree-comprehensive-bufferize) //----- //
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<CPUDefault>} {
%c0 = arith.constant 0 : index
%alloca = memref.alloca() {alignment = 64 : i64} : memref<2x2xi32>
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%0#0) flags("ReadOnly|Indirect") : memref<2xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<2xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%0#1) flags("ReadOnly|Indirect") : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%0#2) flags(Indirect) : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 1 : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0) in (2) {
%subview = memref.subview %1[%arg0] [1] [1] : memref<2xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %2[%arg0, 0] [1, 2] [1, 1] : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.scatter {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1]]>} dimension_map = [0, 1] unique_indices(true) ins(%subview, %subview_0 : memref<1xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>, memref<1x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%alloca : memref<2x2xi32>) {
^bb0(%arg1: i32, %arg2: i32):
iree_linalg_ext.yield %arg1 : i32
}
%subview_1 = memref.subview %3[0, 0] [2, 2] [1, 1] : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%alloca : memref<2x2xi32>) outs(%subview_1 : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#iree_codegen.workgroup_mapping<x>]}
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%3 : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%3 : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
return
}
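// Bufferization has replaced the flow.dispatch.tensor loads/stores with strided memref bindings
// plus a 64-byte-aligned stack alloca that stages the scatter result before the linalg.generic
// copy into binding 2. A small sketch of the strided addressing behind the indices subview
// (illustrative helper, not an IREE API):

def subview_element_address(base_offset, wg_x, i, j, strides=(2, 1)):
    # memref.subview %2[%arg0, 0] [1, 2] [1, 1] keeps the parent strides [2, 1] and
    # folds %arg0 into the dynamic base offset, so element (i, j) of the 1x2 slice
    # lives at base_offset + (wg_x + i) * strides[0] + j * strides[1].
    return base_offset + (wg_x + i) * strides[0] + j * strides[1]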
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- //
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<CPUDefault>} {
%c0 = arith.constant 0 : index
%alloca = memref.alloca() {alignment = 64 : i64} : memref<2x2xi32>
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%0#0) flags("ReadOnly|Indirect") : memref<2xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<2xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%0#1) flags("ReadOnly|Indirect") : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%0#2) flags(Indirect) : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 1 : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0) in (2) {
%subview = memref.subview %1[%arg0] [1] [1] : memref<2xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %2[%arg0, 0] [1, 2] [1, 1] : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.scatter {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1]]>} dimension_map = [0, 1] unique_indices(true) ins(%subview, %subview_0 : memref<1xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>, memref<1x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%alloca : memref<2x2xi32>) {
^bb0(%arg1: i32, %arg2: i32):
iree_linalg_ext.yield %arg1 : i32
}
%subview_1 = memref.subview %3[0, 0] [2, 2] [1, 1] : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%alloca : memref<2x2xi32>) outs(%subview_1 : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#iree_codegen.workgroup_mapping<x>]}
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%3 : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%3 : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<CPUDefault>} {
%c0 = arith.constant 0 : index
%alloca = memref.alloca() {alignment = 64 : i64} : memref<2x2xi32>
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%0#0) flags("ReadOnly|Indirect") : memref<2xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<2xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%0#1) flags("ReadOnly|Indirect") : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%0#2) flags(Indirect) : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 1 : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0) in (2) {
%subview = memref.subview %1[%arg0] [1] [1] : memref<2xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %2[%arg0, 0] [1, 2] [1, 1] : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.scatter {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1]]>} dimension_map = [0, 1] unique_indices(true) ins(%subview, %subview_0 : memref<1xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>, memref<1x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%alloca : memref<2x2xi32>) {
^bb0(%arg1: i32, %arg2: i32):
iree_linalg_ext.yield %arg1 : i32
}
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%alloca : memref<2x2xi32>) outs(%3 : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<CPUDefault>} {
%c0 = arith.constant 0 : index
%alloca = memref.alloca() {alignment = 64 : i64} : memref<2x2xi32>
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%0#0) flags("ReadOnly|Indirect") : memref<2xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<2xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%0#1) flags("ReadOnly|Indirect") : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%0#2) flags(Indirect) : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 1 : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0) in (2) {
%subview = memref.subview %1[%arg0] [1] [1] : memref<2xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %2[%arg0, 0] [1, 2] [1, 1] : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.scatter {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1]]>} dimension_map = [0, 1] unique_indices(true) ins(%subview, %subview_0 : memref<1xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>, memref<1x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%alloca : memref<2x2xi32>) {
^bb0(%arg1: i32, %arg2: i32):
iree_linalg_ext.yield %arg1 : i32
}
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%alloca : memref<2x2xi32>) outs(%3 : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<CPUDefault>} {
%c0 = arith.constant 0 : index
%alloca = memref.alloca() {alignment = 64 : i64} : memref<2x2xi32>
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%0#0) flags("ReadOnly|Indirect") : memref<2xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<2xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%0#1) flags("ReadOnly|Indirect") : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%0#2) flags(Indirect) : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 1 : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0) in (2) {
%subview = memref.subview %1[%arg0] [1] [1] : memref<2xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %2[%arg0, 0] [1, 2] [1, 1] : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.scatter {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1]]>} dimension_map = [0, 1] unique_indices(true) ins(%subview, %subview_0 : memref<1xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>, memref<1x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%alloca : memref<2x2xi32>) {
^bb0(%arg1: i32, %arg2: i32):
iree_linalg_ext.yield %arg1 : i32
}
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%alloca : memref<2x2xi32>) outs(%3 : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After CleanupBufferAllocViewPass (iree-codegen-cleanup-buffer-alloc-view) //----- //
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<CPUDefault>} {
%c0 = arith.constant 0 : index
%alloca = memref.alloca() {alignment = 64 : i64} : memref<2x2xi32>
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%0#0) flags("ReadOnly|Indirect") : memref<2xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<2xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%0#1) flags("ReadOnly|Indirect") : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%0#2) flags(Indirect) : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 1 : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0) in (2) {
%subview = memref.subview %1[%arg0] [1] [1] : memref<2xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %2[%arg0, 0] [1, 2] [1, 1] : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.scatter {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1]]>} dimension_map = [0, 1] unique_indices(true) ins(%subview, %subview_0 : memref<1xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>, memref<1x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%alloca : memref<2x2xi32>) {
^bb0(%arg1: i32, %arg2: i32):
iree_linalg_ext.yield %arg1 : i32
}
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%alloca : memref<2x2xi32>) outs(%3 : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After LLVMCPULowerExecutableTargetPass (iree-llvmcpu-lower-executable-target) //----- //
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<CPUDefault>} {
%c0 = arith.constant 0 : index
%alloca = memref.alloca() {alignment = 64 : i64} : memref<2x2xi32>
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%0#0) flags("ReadOnly|Indirect") : memref<2xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<2xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%0#1) flags("ReadOnly|Indirect") : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%0#2) flags(Indirect) : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 1 : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0) in (2) {
%subview = memref.subview %1[%arg0] [1] [1] : memref<2xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %2[%arg0, 0] [1, 2] [1, 1] : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.scatter {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1]]>} dimension_map = [0, 1] unique_indices(true) ins(%subview, %subview_0 : memref<1xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>, memref<1x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%alloca : memref<2x2xi32>) {
^bb0(%arg1: i32, %arg2: i32):
iree_linalg_ext.yield %arg1 : i32
}
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%alloca : memref<2x2xi32>) outs(%3 : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After ReconcileTranslationInfoPass (iree-codegen-reconcile-translation-info) //----- //
hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) {
hal.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store ordinal(0) layout(#hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]} {
^bb0(%arg0: !hal.device):
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c1_0 = arith.constant 1 : index
hal.return %c2, %c1, %c1_0 : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store() {
%c0 = arith.constant 0 : index
%alloca = memref.alloca() {alignment = 64 : i64} : memref<2x2xi32>
%0:3 = util.assume.int
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>,
%c0<umin = 0, umax = 0>
: index, index, index
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%0#0) flags("ReadOnly|Indirect") : memref<2xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<2xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%0#1) flags("ReadOnly|Indirect") : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%0#2) flags(Indirect) : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 1 : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%subview = memref.subview %1[%workgroup_id_x] [1] [1] : memref<2xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %2[%workgroup_id_x, 0] [1, 2] [1, 1] : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%subview, %subview_0 : memref<1xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>, memref<1x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%alloca : memref<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
}
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%alloca : memref<2x2xi32>) outs(%3 : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
return
}
}
}
// -----// IR Dump After DropCompilerHints (iree-util-drop-compiler-hints) //----- //
hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) {
hal.executable.export public @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store ordinal(0) layout(#hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]} {
^bb0(%arg0: !hal.device):
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c1_0 = arith.constant 1 : index
hal.return %c2, %c1, %c1_0 : index, index, index
}
builtin.module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store() {
%c0 = arith.constant 0 : index
%alloca = memref.alloca() {alignment = 64 : i64} : memref<2x2xi32>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<2xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%subview = memref.subview %0[%workgroup_id_x] [1] [1] : memref<2xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[%workgroup_id_x, 0] [1, 2] [1, 1] : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%subview, %subview_0 : memref<1xi32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>, memref<1x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%alloca : memref<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
}
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%alloca : memref<2x2xi32>) outs(%2 : memref<2x2xi32, strided<[2, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
return
}
}
}
// -----// IR Dump After EraseHALDescriptorTypeFromMemRefPass (iree-codegen-erase-hal-descriptor-type-from-memref) //----- //
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store() {
%c0 = arith.constant 0 : index
%alloca = memref.alloca() {alignment = 64 : i64} : memref<2x2xi32>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xi32, strided<[1], offset: ?>>
memref.assume_alignment %0, 1 : memref<2xi32, strided<[1], offset: ?>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x2xi32, strided<[2, 1], offset: ?>>
memref.assume_alignment %1, 1 : memref<2x2xi32, strided<[2, 1], offset: ?>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x2xi32, strided<[2, 1], offset: ?>>
memref.assume_alignment %2, 1 : memref<2x2xi32, strided<[2, 1], offset: ?>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%subview = memref.subview %0[%workgroup_id_x] [1] [1] : memref<2xi32, strided<[1], offset: ?>> to memref<1xi32, strided<[1], offset: ?>>
%subview_0 = memref.subview %1[%workgroup_id_x, 0] [1, 2] [1, 1] : memref<2x2xi32, strided<[2, 1], offset: ?>> to memref<1x2xi32, strided<[2, 1], offset: ?>>
iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%subview, %subview_0 : memref<1xi32, strided<[1], offset: ?>>, memref<1x2xi32, strided<[2, 1], offset: ?>>) outs(%alloca : memref<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
}
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%alloca : memref<2x2xi32>) outs(%2 : memref<2x2xi32, strided<[2, 1], offset: ?>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
return
}
// -----// IR Dump After LowerUKernelOpsToCallsPass (iree-codegen-lower-ukernel-ops-to-calls) //----- //
module {
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store() {
%c0 = arith.constant 0 : index
%alloca = memref.alloca() {alignment = 64 : i64} : memref<2x2xi32>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xi32, strided<[1], offset: ?>>
memref.assume_alignment %0, 1 : memref<2xi32, strided<[1], offset: ?>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x2xi32, strided<[2, 1], offset: ?>>
memref.assume_alignment %1, 1 : memref<2x2xi32, strided<[2, 1], offset: ?>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x2xi32, strided<[2, 1], offset: ?>>
memref.assume_alignment %2, 1 : memref<2x2xi32, strided<[2, 1], offset: ?>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%subview = memref.subview %0[%workgroup_id_x] [1] [1] : memref<2xi32, strided<[1], offset: ?>> to memref<1xi32, strided<[1], offset: ?>>
%subview_0 = memref.subview %1[%workgroup_id_x, 0] [1, 2] [1, 1] : memref<2x2xi32, strided<[2, 1], offset: ?>> to memref<1x2xi32, strided<[2, 1], offset: ?>>
iree_linalg_ext.scatter dimension_map = [0, 1] unique_indices(true) ins(%subview, %subview_0 : memref<1xi32, strided<[1], offset: ?>>, memref<1x2xi32, strided<[2, 1], offset: ?>>) outs(%alloca : memref<2x2xi32>) {
^bb0(%arg0: i32, %arg1: i32):
iree_linalg_ext.yield %arg0 : i32
}
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%alloca : memref<2x2xi32>) outs(%2 : memref<2x2xi32, strided<[2, 1], offset: ?>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
return
}
}
// -----// IR Dump After LinalgExtToLoopsPass (iree-linalg-ext-to-loops) //----- //
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store() {
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%alloca = memref.alloca() {alignment = 64 : i64} : memref<2x2xi32>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xi32, strided<[1], offset: ?>>
memref.assume_alignment %0, 1 : memref<2xi32, strided<[1], offset: ?>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x2xi32, strided<[2, 1], offset: ?>>
memref.assume_alignment %1, 1 : memref<2x2xi32, strided<[2, 1], offset: ?>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x2xi32, strided<[2, 1], offset: ?>>
memref.assume_alignment %2, 1 : memref<2x2xi32, strided<[2, 1], offset: ?>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%subview = memref.subview %0[%workgroup_id_x] [1] [1] : memref<2xi32, strided<[1], offset: ?>> to memref<1xi32, strided<[1], offset: ?>>
%subview_0 = memref.subview %1[%workgroup_id_x, 0] [1, 2] [1, 1] : memref<2x2xi32, strided<[2, 1], offset: ?>> to memref<1x2xi32, strided<[2, 1], offset: ?>>
scf.for %arg0 = %c0 to %c1 step %c1 {
%3 = memref.load %subview[%arg0] : memref<1xi32, strided<[1], offset: ?>>
%4 = memref.load %subview_0[%arg0, %c0] : memref<1x2xi32, strided<[2, 1], offset: ?>>
%5 = arith.index_cast %4 : i32 to index
%6 = memref.load %subview_0[%arg0, %c1] : memref<1x2xi32, strided<[2, 1], offset: ?>>
%7 = arith.index_cast %6 : i32 to index
memref.store %3, %alloca[%5, %7] : memref<2x2xi32>
}
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%alloca : memref<2x2xi32>) outs(%2 : memref<2x2xi32, strided<[2, 1], offset: ?>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
return
}
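
As a plain-Python sketch (again, not compiler output), the scalar loop that LinalgExtToLoopsPass produces for one workgroup's slice reads the update value, converts the two i32 indices to array coordinates, and stores into the scratch buffer; the function and parameter names below are illustrative.

def scatter_one_workgroup(upd_slice, idx_slice, scratch):
    for i in range(1):                  # scf.for %arg0 = %c0 to %c1 step %c1
        value = upd_slice[i]            # memref.load from the update subview
        row = int(idx_slice[i][0])      # memref.load + arith.index_cast (dim 0)
        col = int(idx_slice[i][1])      # memref.load + arith.index_cast (dim 1)
        scratch[row][col] = value       # memref.store into %alloca
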
// -----// IR Dump After MemrefCopyToLinalgPass (iree-codegen-memrefcopy-to-linalg) //----- //
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store() {
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%alloca = memref.alloca() {alignment = 64 : i64} : memref<2x2xi32>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xi32, strided<[1], offset: ?>>
memref.assume_alignment %0, 1 : memref<2xi32, strided<[1], offset: ?>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x2xi32, strided<[2, 1], offset: ?>>
memref.assume_alignment %1, 1 : memref<2x2xi32, strided<[2, 1], offset: ?>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x2xi32, strided<[2, 1], offset: ?>>
memref.assume_alignment %2, 1 : memref<2x2xi32, strided<[2, 1], offset: ?>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%subview = memref.subview %0[%workgroup_id_x] [1] [1] : memref<2xi32, strided<[1], offset: ?>> to memref<1xi32, strided<[1], offset: ?>>
%subview_0 = memref.subview %1[%workgroup_id_x, 0] [1, 2] [1, 1] : memref<2x2xi32, strided<[2, 1], offset: ?>> to memref<1x2xi32, strided<[2, 1], offset: ?>>
scf.for %arg0 = %c0 to %c1 step %c1 {
%3 = memref.load %subview[%arg0] : memref<1xi32, strided<[1], offset: ?>>
%4 = memref.load %subview_0[%arg0, %c0] : memref<1x2xi32, strided<[2, 1], offset: ?>>
%5 = arith.index_cast %4 : i32 to index
%6 = memref.load %subview_0[%arg0, %c1] : memref<1x2xi32, strided<[2, 1], offset: ?>>
%7 = arith.index_cast %6 : i32 to index
memref.store %3, %alloca[%5, %7] : memref<2x2xi32>
}
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%alloca : memref<2x2xi32>) outs(%2 : memref<2x2xi32, strided<[2, 1], offset: ?>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
return
}
// -----// IR Dump After ConvertLinalgToLoopsPass (convert-linalg-to-loops) //----- //
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%alloca = memref.alloca() {alignment = 64 : i64} : memref<2x2xi32>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xi32, strided<[1], offset: ?>>
memref.assume_alignment %0, 1 : memref<2xi32, strided<[1], offset: ?>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x2xi32, strided<[2, 1], offset: ?>>
memref.assume_alignment %1, 1 : memref<2x2xi32, strided<[2, 1], offset: ?>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x2xi32, strided<[2, 1], offset: ?>>
memref.assume_alignment %2, 1 : memref<2x2xi32, strided<[2, 1], offset: ?>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%subview = memref.subview %0[%workgroup_id_x] [1] [1] : memref<2xi32, strided<[1], offset: ?>> to memref<1xi32, strided<[1], offset: ?>>
%subview_0 = memref.subview %1[%workgroup_id_x, 0] [1, 2] [1, 1] : memref<2x2xi32, strided<[2, 1], offset: ?>> to memref<1x2xi32, strided<[2, 1], offset: ?>>
scf.for %arg0 = %c0 to %c1 step %c1 {
%3 = memref.load %subview[%arg0] : memref<1xi32, strided<[1], offset: ?>>
%4 = memref.load %subview_0[%arg0, %c0] : memref<1x2xi32, strided<[2, 1], offset: ?>>
%5 = arith.index_cast %4 : i32 to index
%6 = memref.load %subview_0[%arg0, %c1] : memref<1x2xi32, strided<[2, 1], offset: ?>>
%7 = arith.index_cast %6 : i32 to index
memref.store %3, %alloca[%5, %7] : memref<2x2xi32>
}
scf.for %arg0 = %c0 to %c2 step %c1 {
scf.for %arg1 = %c0 to %c2 step %c1 {
%3 = memref.load %alloca[%arg0, %arg1] : memref<2x2xi32>
memref.store %3, %2[%arg0, %arg1] : memref<2x2xi32, strided<[2, 1], offset: ?>>
}
}
return
}
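
The remaining linalg.generic is lowered by ConvertLinalgToLoopsPass into the nested copy loops above; a trivial Python equivalent (illustrative names) is:

def copy_scratch_to_output(scratch, out):
    for i in range(2):                  # outer scf.for over rows
        for j in range(2):              # inner scf.for over columns
            out[i][j] = scratch[i][j]   # load from %alloca, store to binding 2
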
// -----// IR Dump After ConvertBf16ArithToF32Pass (iree-convert-bf16-arith-to-f32) //----- //
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%alloca = memref.alloca() {alignment = 64 : i64} : memref<2x2xi32>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xi32, strided<[1], offset: ?>>
memref.assume_alignment %0, 1 : memref<2xi32, strided<[1], offset: ?>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x2xi32, strided<[2, 1], offset: ?>>
memref.assume_alignment %1, 1 : memref<2x2xi32, strided<[2, 1], offset: ?>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x2xi32, strided<[2, 1], offset: ?>>
memref.assume_alignment %2, 1 : memref<2x2xi32, strided<[2, 1], offset: ?>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%subview = memref.subview %0[%workgroup_id_x] [1] [1] : memref<2xi32, strided<[1], offset: ?>> to memref<1xi32, strided<[1], offset: ?>>
%subview_0 = memref.subview %1[%workgroup_id_x, 0] [1, 2] [1, 1] : memref<2x2xi32, strided<[2, 1], offset: ?>> to memref<1x2xi32, strided<[2, 1], offset: ?>>
scf.for %arg0 = %c0 to %c1 step %c1 {
%3 = memref.load %subview[%arg0] : memref<1xi32, strided<[1], offset: ?>>
%4 = memref.load %subview_0[%arg0, %c0] : memref<1x2xi32, strided<[2, 1], offset: ?>>
%5 = arith.index_cast %4 : i32 to index
%6 = memref.load %subview_0[%arg0, %c1] : memref<1x2xi32, strided<[2, 1], offset: ?>>
%7 = arith.index_cast %6 : i32 to index
memref.store %3, %alloca[%5, %7] : memref<2x2xi32>
}
scf.for %arg0 = %c0 to %c2 step %c1 {
scf.for %arg1 = %c0 to %c2 step %c1 {
%3 = memref.load %alloca[%arg0, %arg1] : memref<2x2xi32>
memref.store %3, %2[%arg0, %arg1] : memref<2x2xi32, strided<[2, 1], offset: ?>>
}
}
return
}
// -----// IR Dump After ConvertBf16ToUInt16BuffersPass (iree-codegen-convert-bf16-to-uint16-buffers) //----- //
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%alloca = memref.alloca() {alignment = 64 : i64} : memref<2x2xi32>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xi32, strided<[1], offset: ?>>
memref.assume_alignment %0, 1 : memref<2xi32, strided<[1], offset: ?>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x2xi32, strided<[2, 1], offset: ?>>
memref.assume_alignment %1, 1 : memref<2x2xi32, strided<[2, 1], offset: ?>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x2xi32, strided<[2, 1], offset: ?>>
memref.assume_alignment %2, 1 : memref<2x2xi32, strided<[2, 1], offset: ?>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%subview = memref.subview %0[%workgroup_id_x] [1] [1] : memref<2xi32, strided<[1], offset: ?>> to memref<1xi32, strided<[1], offset: ?>>
%subview_0 = memref.subview %1[%workgroup_id_x, 0] [1, 2] [1, 1] : memref<2x2xi32, strided<[2, 1], offset: ?>> to memref<1x2xi32, strided<[2, 1], offset: ?>>
scf.for %arg0 = %c0 to %c1 step %c1 {
%3 = memref.load %subview[%arg0] : memref<1xi32, strided<[1], offset: ?>>
%4 = memref.load %subview_0[%arg0, %c0] : memref<1x2xi32, strided<[2, 1], offset: ?>>
%5 = arith.index_cast %4 : i32 to index
%6 = memref.load %subview_0[%arg0, %c1] : memref<1x2xi32, strided<[2, 1], offset: ?>>
%7 = arith.index_cast %6 : i32 to index
memref.store %3, %alloca[%5, %7] : memref<2x2xi32>
}
scf.for %arg0 = %c0 to %c2 step %c1 {
scf.for %arg1 = %c0 to %c2 step %c1 {
%3 = memref.load %alloca[%arg0, %arg1] : memref<2x2xi32>
memref.store %3, %2[%arg0, %arg1] : memref<2x2xi32, strided<[2, 1], offset: ?>>
}
}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @scatter_dispatch_0_scatter_2x2xi32_dispatch_tensor_store() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%alloca = memref.alloca() {alignment = 64 : i64} : memref<2x2xi32>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xi32, strided<[1], offset: ?>>
memref.assume_alignment %0, 1 : memref<2xi32, strided<[1], offset: ?>>
%1 = hal.interface