Skip to content

Instantly share code, notes, and snippets.

@benvanik
Created June 15, 2022 19:41
Show Gist options
  • Save benvanik/c9b4eabde8801ee66e39813e3392187a to your computer and use it in GitHub Desktop.
Save benvanik/c9b4eabde8801ee66e39813e3392187a to your computer and use it in GitHub Desktop.
simple_mul.mlir
{
// Debug launch entry for the iree-compile tool: compiles the simple_mul.mlir
// test input and writes simple_mul.vmfb under ${workspaceFolder}/../iree-tmp.
"name": "(gdb) iree-compile",
"type": "cppdbg",
"request": "launch",
// Task referenced by name; its definition is not part of this snippet.
"preLaunchTask": "build-iree-compile",
// The binary is resolved relative to the CMake build directory.
"program": "${command:cmake.buildDirectory}/tools/iree-compile",
// Command-line arguments. Lines that are commented out are alternative flags
// or alternative input/output pairs kept around for quick toggling.
"args": [
// "-iree-vm-bytecode-module-output-format=annotated-mlir-text",
"-iree-vm-bytecode-source-listing=${workspaceFolder}/../iree-tmp/vm.mlir",
"-iree-vm-emit-polyglot-zip=true",
// "-mlir-elide-elementsattrs-if-larger=8192",
"-mlir-disable-threading",
// "-mlir-print-ir-before-all",
// "-mlir-print-ir-after-all",
// "-iree-hal-dump-executable-sources-to=${workspaceFolder}/../iree-tmp/executables/",
// Active target selection: dylib-llvm-aot backend, x86_64-pc-linux-elf triple.
"-iree-hal-target-backends=dylib-llvm-aot",
"-iree-llvm-target-triple=x86_64-pc-linux-elf",
"-iree-llvm-link-embedded",
// "-iree-llvm-keep-linker-artifacts",
// "-iree-input-type=tosa",
// Active input: mhlo dialect, simple_mul.mlir -> simple_mul.vmfb.
"-iree-input-type=mhlo",
"${workspaceFolder}/runtime/src/iree/runtime/testdata/simple_mul.mlir",
"-o=${workspaceFolder}/../iree-tmp/simple_mul.vmfb",
// Alternative models (disabled). Enable exactly one input/-o pair at a time.
// "-iree-input-type=mhlo",
// "${workspaceFolder}/iree/test/e2e/models/mobilenetv3_fake_weights.mlir",
// "-o=${workspaceFolder}/../iree-tmp/mobilenetv3_fake_weights.vmfb",
// "${workspaceFolder}/iree/test/e2e/models/unidirectional_lstm.mlir",
// "-o=${workspaceFolder}/../iree-tmp/unidirectional_lstm.vmfb",
// NOTE(review): the three entries below look like shell-style redirection of
// stdout and stderr into iree-compile-out.txt. That only works if the debugger
// launches the program through something that interprets ">" and "2>&1";
// otherwise they are passed to iree-compile as literal arguments. Confirm.
">",
"${workspaceFolder}/../iree-tmp/iree-compile-out.txt",
"2>&1"
],
"stopAtEntry": false,
"cwd": "${workspaceFolder}",
// "internalConsoleOptions": "openOnSessionStart",
"externalConsole": false,
"MIMode": "gdb",
"setupCommands": [
{
"description": "Enable pretty-printing for gdb",
"text": "-enable-pretty-printing",
"ignoreFailures": true
}
],
// The comma after "setupCommands" above is a trailing comma while the line
// below stays commented out. It is kept so the line can simply be uncommented;
// a strict JSON parser would reject it.
// "visualizerFile": "${workspaceFolder}/iree.natvis"
},
{
// Debug launch entry for the iree-run-module tool: loads simple_mul.vmfb from
// ${workspaceFolder}/../iree-tmp and invokes its simple_mul entry function.
// NOTE(review): the label says "(lldb)" but the active settings below are
// "type": "cppdbg" with "MIMode": "gdb"; the lldb variants are commented out.
// Confirm which debugger is intended before relying on the name.
"name": "(lldb) iree-run-module",
"type": "cppdbg",
// "type": "lldb",
"request": "launch",
// Task referenced by name; its definition is not part of this snippet.
"preLaunchTask": "build-iree-run-module",
"program": "${command:cmake.buildDirectory}/tools/iree-run-module",
"args": [
"--trace_execution",
"--device=local-sync",
// Same path that the iree-compile configuration in this file writes via -o.
"--module_file=${workspaceFolder}/../iree-tmp/simple_mul.vmfb",
"--entry_function=simple_mul",
// Two inputs, one per argument of simple_mul.
// Presumably each is a 4-element f32 tensor filled with the given value;
// verify against the iree-run-module flag documentation.
"--function_input=4xf32=12",
"--function_input=4xf32=2",
// NOTE(review): the three entries below look like shell-style redirection of
// stdout and stderr into iree-run-module-out.txt. That only works if the
// debugger launches the program through something that interprets ">" and
// "2>&1"; otherwise they are passed to the tool as literal arguments. Confirm.
">",
"${workspaceFolder}/../iree-tmp/iree-run-module-out.txt",
"2>&1"
],
"stopAtEntry": false,
"cwd": "${workspaceFolder}",
// Extra environment for the debuggee: sets TRACY_NO_EXIT=1.
"environment": [
{
"name": "TRACY_NO_EXIT",
// The comma after "1" is a trailing comma; a strict JSON parser would reject it.
"value": "1",
}
],
// "internalConsoleOptions": "openOnSessionStart",
"externalConsole": false,
"MIMode": "gdb",
// "MIMode": "lldb",
// "miDebuggerPath": "/usr/bin/lldb",
"setupCommands": [
{
"description": "Enable pretty-printing for gdb",
"text": "-enable-pretty-printing",
"ignoreFailures": true
}
],
"visualizerFile": "${workspaceFolder}/iree.natvis"
},
This file has been truncated, but you can view the full file.
// -----// IR Dump After TopLevelSCFToCFG //----- //
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
%0 = mhlo.multiply %arg0, %arg1 {name = "mul.1"} : tensor<4xf32>
return %0 : tensor<4xf32>
}
// -----// IR Dump After MHLOToMHLOPreprocessing //----- //
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
%0 = mhlo.multiply %arg0, %arg1 {name = "mul.1"} : tensor<4xf32>
return %0 : tensor<4xf32>
}
// -----// IR Dump After Canonicalizer //----- //
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
%0 = mhlo.multiply %arg0, %arg1 {name = "mul.1"} : tensor<4xf32>
return %0 : tensor<4xf32>
}
// -----// IR Dump After ShapeToShapeLowering //----- //
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
%0 = mhlo.multiply %arg0, %arg1 {name = "mul.1"} : tensor<4xf32>
return %0 : tensor<4xf32>
}
// -----// IR Dump After ConvertShapeToStandard //----- //
module {
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
%0 = mhlo.multiply %arg0, %arg1 {name = "mul.1"} : tensor<4xf32>
return %0 : tensor<4xf32>
}
}
// -----// IR Dump After Canonicalizer //----- //
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
%0 = mhlo.multiply %arg0, %arg1 {name = "mul.1"} : tensor<4xf32>
return %0 : tensor<4xf32>
}
// -----// IR Dump After Canonicalizer //----- //
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
%0 = mhlo.multiply %arg0, %arg1 {name = "mul.1"} : tensor<4xf32>
return %0 : tensor<4xf32>
}
// -----// IR Dump After Inliner //----- //
module {
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
%0 = mhlo.multiply %arg0, %arg1 {name = "mul.1"} : tensor<4xf32>
return %0 : tensor<4xf32>
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::DemoteI64ToI32Pass //----- //
module {
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
%0 = mhlo.multiply %arg0, %arg1 {name = "mul.1"} : tensor<4xf32>
return %0 : tensor<4xf32>
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::DemoteF64ToF32Pass //----- //
module {
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
%0 = mhlo.multiply %arg0, %arg1 {name = "mul.1"} : tensor<4xf32>
return %0 : tensor<4xf32>
}
}
// -----// IR Dump After Canonicalizer //----- //
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
%0 = mhlo.multiply %arg0, %arg1 {name = "mul.1"} : tensor<4xf32>
return %0 : tensor<4xf32>
}
// -----// IR Dump After CSE //----- //
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
%0 = mhlo.multiply %arg0, %arg1 {name = "mul.1"} : tensor<4xf32>
return %0 : tensor<4xf32>
}
// -----// IR Dump After ConvertMHLOToLinalgExt //----- //
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
%0 = mhlo.multiply %arg0, %arg1 {name = "mul.1"} : tensor<4xf32>
return %0 : tensor<4xf32>
}
// -----// IR Dump After ConvertMHLOToLinalgOnTensors //----- //
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
%0 = linalg.init_tensor [4] : tensor<4xf32>
%1 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<4xf32>, tensor<4xf32>) outs(%0 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%2 = arith.mulf %arg2, %arg3 : f32
linalg.yield %2 : f32
} -> tensor<4xf32>
return %1 : tensor<4xf32>
}
// -----// IR Dump After ReconcileUnrealizedCasts //----- //
#map = affine_map<(d0) -> (d0)>
module {
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
%0 = linalg.init_tensor [4] : tensor<4xf32>
%1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<4xf32>, tensor<4xf32>) outs(%0 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%2 = arith.mulf %arg2, %arg3 : f32
linalg.yield %2 : f32
} -> tensor<4xf32>
return %1 : tensor<4xf32>
}
}
// -----// IR Dump After Canonicalizer //----- //
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
%0 = linalg.init_tensor [4] : tensor<4xf32>
%1 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<4xf32>, tensor<4xf32>) outs(%0 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%2 = arith.mulf %arg2, %arg3 : f32
linalg.yield %2 : f32
} -> tensor<4xf32>
return %1 : tensor<4xf32>
}
// -----// IR Dump After VerifyCompilerMHLOInputLegality //----- //
#map = affine_map<(d0) -> (d0)>
module {
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
%0 = linalg.init_tensor [4] : tensor<4xf32>
%1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<4xf32>, tensor<4xf32>) outs(%0 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%2 = arith.mulf %arg2, %arg3 : f32
linalg.yield %2 : f32
} -> tensor<4xf32>
return %1 : tensor<4xf32>
}
}
// -----// IR Dump After IREEImportPublic //----- //
#map = affine_map<(d0) -> (d0)>
module {
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
%0 = linalg.init_tensor [4] : tensor<4xf32>
%1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<4xf32>, tensor<4xf32>) outs(%0 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%2 = arith.mulf %arg2, %arg3 : f32
linalg.yield %2 : f32
} -> tensor<4xf32>
return %1 : tensor<4xf32>
}
}
// -----// IR Dump After SanitizeModuleNames //----- //
#map = affine_map<(d0) -> (d0)>
module {
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
%0 = linalg.init_tensor [4] : tensor<4xf32>
%1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<4xf32>, tensor<4xf32>) outs(%0 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%2 = arith.mulf %arg2, %arg3 : f32
linalg.yield %2 : f32
} -> tensor<4xf32>
return %1 : tensor<4xf32>
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass //----- //
#map = affine_map<(d0) -> (d0)>
module {
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = call @_simple_mul(%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
func.func private @_simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
%0 = linalg.init_tensor [4] : tensor<4xf32>
%1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<4xf32>, tensor<4xf32>) outs(%0 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%2 = arith.mulf %arg2, %arg3 : f32
linalg.yield %2 : f32
} -> tensor<4xf32>
return %1 : tensor<4xf32>
}
}
// -----// IR Dump After Canonicalizer //----- //
func.func private @_simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
%0 = linalg.init_tensor [4] : tensor<4xf32>
%1 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<4xf32>, tensor<4xf32>) outs(%0 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%2 = arith.mulf %arg2, %arg3 : f32
linalg.yield %2 : f32
} -> tensor<4xf32>
return %1 : tensor<4xf32>
}
// -----// IR Dump After Canonicalizer //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = call @_simple_mul(%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = linalg.init_tensor [4] : tensor<4xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%5 = arith.mulf %arg2, %arg3 : f32
linalg.yield %5 : f32
} -> tensor<4xf32>
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
// -----// IR Dump After Inliner //----- //
#map = affine_map<(d0) -> (d0)>
module {
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = linalg.init_tensor [4] : tensor<4xf32>
%3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%5 = arith.mulf %arg2, %arg3 : f32
linalg.yield %5 : f32
} -> tensor<4xf32>
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = linalg.init_tensor [4] : tensor<4xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%5 = arith.mulf %arg2, %arg3 : f32
linalg.yield %5 : f32
} -> tensor<4xf32>
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
// -----// IR Dump After CSE //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = linalg.init_tensor [4] : tensor<4xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%5 = arith.mulf %arg2, %arg3 : f32
linalg.yield %5 : f32
} -> tensor<4xf32>
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
// -----// IR Dump After SymbolDCE //----- //
#map = affine_map<(d0) -> (d0)>
module {
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = linalg.init_tensor [4] : tensor<4xf32>
%3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%5 = arith.mulf %arg2, %arg3 : f32
linalg.yield %5 : f32
} -> tensor<4xf32>
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::DemoteF64ToF32Pass //----- //
#map = affine_map<(d0) -> (d0)>
module {
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = linalg.init_tensor [4] : tensor<4xf32>
%3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%5 = arith.mulf %arg2, %arg3 : f32
linalg.yield %5 : f32
} -> tensor<4xf32>
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After ConvertConv2D1x1ConvToMatmul //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = linalg.init_tensor [4] : tensor<4xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%5 = arith.mulf %arg2, %arg3 : f32
linalg.yield %5 : f32
} -> tensor<4xf32>
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
// -----// IR Dump After VerifyInputLegality //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = linalg.init_tensor [4] : tensor<4xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%5 = arith.mulf %arg2, %arg3 : f32
linalg.yield %5 : f32
} -> tensor<4xf32>
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
// -----// IR Dump After LinalgNamedOpConversion //----- //
#map = affine_map<(d0) -> (d0)>
module {
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = linalg.init_tensor [4] : tensor<4xf32>
%3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%5 = arith.mulf %arg2, %arg3 : f32
linalg.yield %5 : f32
} -> tensor<4xf32>
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After ExpandTensorShapes //----- //
#map = affine_map<(d0) -> (d0)>
module {
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = linalg.init_tensor [4] : tensor<4xf32>
%3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%5 = arith.mulf %arg2, %arg3 : f32
linalg.yield %5 : f32
} -> tensor<4xf32>
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = linalg.init_tensor [4] : tensor<4xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%5 = arith.mulf %arg2, %arg3 : f32
linalg.yield %5 : f32
} -> tensor<4xf32>
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass //----- //
#map = affine_map<(d0) -> (d0)>
module attributes {iree.fixedpoint.iteration = 0 : index} {
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = linalg.init_tensor [4] : tensor<4xf32>
%3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%5 = arith.mulf %arg2, %arg3 : f32
linalg.yield %5 : f32
} -> tensor<4xf32>
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass //----- //
#map = affine_map<(d0) -> (d0)>
module attributes {iree.fixedpoint.iteration = 0 : index} {
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = linalg.init_tensor [4] : tensor<4xf32>
%3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%5 = arith.mulf %arg2, %arg3 : f32
linalg.yield %5 : f32
} -> tensor<4xf32>
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = linalg.init_tensor [4] : tensor<4xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%5 = arith.mulf %arg2, %arg3 : f32
linalg.yield %5 : f32
} -> tensor<4xf32>
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
// -----// IR Dump After CSE //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = linalg.init_tensor [4] : tensor<4xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%5 = arith.mulf %arg2, %arg3 : f32
linalg.yield %5 : f32
} -> tensor<4xf32>
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FixedPointIteratorPass //----- //
#map = affine_map<(d0) -> (d0)>
module {
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = linalg.init_tensor [4] : tensor<4xf32>
%3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%5 = arith.mulf %arg2, %arg3 : f32
linalg.yield %5 : f32
} -> tensor<4xf32>
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After PadTensorToSubTensorInsert //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = linalg.init_tensor [4] : tensor<4xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%5 = arith.mulf %arg2, %arg3 : f32
linalg.yield %5 : f32
} -> tensor<4xf32>
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
// -----// IR Dump After ConvertElementwiseToLinalg //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = linalg.init_tensor [4] : tensor<4xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%5 = arith.mulf %arg2, %arg3 : f32
linalg.yield %5 : f32
} -> tensor<4xf32>
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
// -----// IR Dump After LinalgFoldUnitExtentDims //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = linalg.init_tensor [4] : tensor<4xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%5 = arith.mulf %arg2, %arg3 : f32
linalg.yield %5 : f32
} -> tensor<4xf32>
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
// -----// IR Dump After InterchangeGenericOps //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = linalg.init_tensor [4] : tensor<4xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%5 = arith.mulf %arg2, %arg3 : f32
linalg.yield %5 : f32
} -> tensor<4xf32>
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
// -----// IR Dump After ResolveShapedTypeResultDims //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = linalg.init_tensor [4] : tensor<4xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%5 = arith.mulf %arg2, %arg3 : f32
linalg.yield %5 : f32
} -> tensor<4xf32>
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = linalg.init_tensor [4] : tensor<4xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%5 = arith.mulf %arg2, %arg3 : f32
linalg.yield %5 : f32
} -> tensor<4xf32>
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
// -----// IR Dump After CSE //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = linalg.init_tensor [4] : tensor<4xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%5 = arith.mulf %arg2, %arg3 : f32
linalg.yield %5 : f32
} -> tensor<4xf32>
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
// -----// IR Dump After FusionOfTensorOps //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = linalg.init_tensor [4] : tensor<4xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%5 = arith.mulf %arg2, %arg3 : f32
linalg.yield %5 : f32
} -> tensor<4xf32>
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = linalg.init_tensor [4] : tensor<4xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%5 = arith.mulf %arg2, %arg3 : f32
linalg.yield %5 : f32
} -> tensor<4xf32>
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
// -----// IR Dump After CSE //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = linalg.init_tensor [4] : tensor<4xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%5 = arith.mulf %arg2, %arg3 : f32
linalg.yield %5 : f32
} -> tensor<4xf32>
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
// -----// IR Dump After SplitReduction //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = linalg.init_tensor [4] : tensor<4xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%5 = arith.mulf %arg2, %arg3 : f32
linalg.yield %5 : f32
} -> tensor<4xf32>
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
// -----// IR Dump After InterchangeGenericOps //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = linalg.init_tensor [4] : tensor<4xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%5 = arith.mulf %arg2, %arg3 : f32
linalg.yield %5 : f32
} -> tensor<4xf32>
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
// -----// IR Dump After DispatchLinalgOnTensors //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = flow.dispatch.workgroups[%c4, %c1, %c1](%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> =
(%arg2: !flow.dispatch.tensor<readonly:4xf32>, %arg3: !flow.dispatch.tensor<readonly:4xf32>, %arg4: !flow.dispatch.tensor<writeonly:4xf32>) {
%4 = flow.dispatch.tensor.load %arg2, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = flow.dispatch.tensor.load %arg3, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%6 = linalg.init_tensor [4] : tensor<4xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%4, %5 : tensor<4xf32>, tensor<4xf32>) outs(%6 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg5: f32, %arg6: f32, %arg7: f32):
%8 = arith.mulf %arg5, %arg6 : f32
linalg.yield %8 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %7, %arg4, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
flow.return
}
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After CaptureDispatchDynamicDims //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = flow.dispatch.workgroups[%c4, %c1, %c1](%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> =
(%arg2: !flow.dispatch.tensor<readonly:4xf32>, %arg3: !flow.dispatch.tensor<readonly:4xf32>, %arg4: !flow.dispatch.tensor<writeonly:4xf32>) {
%4 = flow.dispatch.tensor.load %arg2, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = flow.dispatch.tensor.load %arg3, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%6 = linalg.init_tensor [4] : tensor<4xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%4, %5 : tensor<4xf32>, tensor<4xf32>) outs(%6 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg5: f32, %arg6: f32, %arg7: f32):
%8 = arith.mulf %arg5, %arg6 : f32
linalg.yield %8 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %7, %arg4, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
flow.return
}
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = flow.dispatch.workgroups[%c4, %c1, %c1](%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> =
(%arg2: !flow.dispatch.tensor<readonly:4xf32>, %arg3: !flow.dispatch.tensor<readonly:4xf32>, %arg4: !flow.dispatch.tensor<writeonly:4xf32>) {
%4 = flow.dispatch.tensor.load %arg2, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = flow.dispatch.tensor.load %arg3, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%6 = linalg.init_tensor [4] : tensor<4xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%4, %5 : tensor<4xf32>, tensor<4xf32>) outs(%6 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg5: f32, %arg6: f32, %arg7: f32):
%8 = arith.mulf %arg5, %arg6 : f32
linalg.yield %8 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %7, %arg4, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
flow.return
}
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After CSE //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = flow.dispatch.workgroups[%c4, %c1, %c1](%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> =
(%arg2: !flow.dispatch.tensor<readonly:4xf32>, %arg3: !flow.dispatch.tensor<readonly:4xf32>, %arg4: !flow.dispatch.tensor<writeonly:4xf32>) {
%4 = flow.dispatch.tensor.load %arg2, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = flow.dispatch.tensor.load %arg3, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%6 = linalg.init_tensor [4] : tensor<4xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%4, %5 : tensor<4xf32>, tensor<4xf32>) outs(%6 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg5: f32, %arg6: f32, %arg7: f32):
%8 = arith.mulf %arg5, %arg6 : f32
linalg.yield %8 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %7, %arg4, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
flow.return
}
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After InitializeEmptyTensors //----- //
#map = affine_map<(d0) -> (d0)>
module {
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = flow.dispatch.workgroups[%c4, %c1, %c1](%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> =
(%arg2: !flow.dispatch.tensor<readonly:4xf32>, %arg3: !flow.dispatch.tensor<readonly:4xf32>, %arg4: !flow.dispatch.tensor<writeonly:4xf32>) {
%4 = flow.dispatch.tensor.load %arg2, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = flow.dispatch.tensor.load %arg3, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%6 = linalg.init_tensor [4] : tensor<4xf32>
%7 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%4, %5 : tensor<4xf32>, tensor<4xf32>) outs(%6 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg5: f32, %arg6: f32, %arg7: f32):
%8 = arith.mulf %arg5, %arg6 : f32
linalg.yield %8 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %7, %arg4, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
flow.return
}
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After OutlineDispatchRegions //----- //
#map = affine_map<(d0) -> (d0)>
module {
flow.executable private @simple_mul_dispatch_0 {
flow.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:4xf32>, %arg1: !flow.dispatch.tensor<readonly:4xf32>, %arg2: !flow.dispatch.tensor<writeonly:4xf32>) {
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%2 = linalg.init_tensor [4] : tensor<4xf32>
%3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%4 = arith.mulf %arg3, %arg4 : f32
linalg.yield %4 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %3, %arg2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = flow.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::StripDebugOpsPass //----- //
flow.executable private @simple_mul_dispatch_0 {
flow.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:4xf32>, %arg1: !flow.dispatch.tensor<readonly:4xf32>, %arg2: !flow.dispatch.tensor<writeonly:4xf32>) {
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%2 = linalg.init_tensor [4] : tensor<4xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%4 = arith.mulf %arg3, %arg4 : f32
linalg.yield %4 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %3, %arg2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
// -----// IR Dump After Canonicalizer //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = flow.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After DeduplicateExecutables //----- //
#map = affine_map<(d0) -> (d0)>
module {
flow.executable private @simple_mul_dispatch_0 {
flow.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:4xf32>, %arg1: !flow.dispatch.tensor<readonly:4xf32>, %arg2: !flow.dispatch.tensor<writeonly:4xf32>) {
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%2 = linalg.init_tensor [4] : tensor<4xf32>
%3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%4 = arith.mulf %arg3, %arg4 : f32
linalg.yield %4 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %3, %arg2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = flow.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer //----- //
flow.executable private @simple_mul_dispatch_0 {
flow.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:4xf32>, %arg1: !flow.dispatch.tensor<readonly:4xf32>, %arg2: !flow.dispatch.tensor<writeonly:4xf32>) {
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%2 = linalg.init_tensor [4] : tensor<4xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%4 = arith.mulf %arg3, %arg4 : f32
linalg.yield %4 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %3, %arg2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
// -----// IR Dump After CSE //----- //
flow.executable private @simple_mul_dispatch_0 {
flow.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:4xf32>, %arg1: !flow.dispatch.tensor<readonly:4xf32>, %arg2: !flow.dispatch.tensor<writeonly:4xf32>) {
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%2 = linalg.init_tensor [4] : tensor<4xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%4 = arith.mulf %arg3, %arg4 : f32
linalg.yield %4 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %3, %arg2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
// -----// IR Dump After CleanupTensorShapes //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = flow.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = flow.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After CSE //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = flow.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After SymbolDCE //----- //
#map = affine_map<(d0) -> (d0)>
module {
flow.executable private @simple_mul_dispatch_0 {
flow.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:4xf32>, %arg1: !flow.dispatch.tensor<readonly:4xf32>, %arg2: !flow.dispatch.tensor<writeonly:4xf32>) {
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%2 = linalg.init_tensor [4] : tensor<4xf32>
%3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%4 = arith.mulf %arg3, %arg4 : f32
linalg.yield %4 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %3, %arg2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = flow.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After VerifyInput //----- //
#map = affine_map<(d0) -> (d0)>
module {
flow.executable private @simple_mul_dispatch_0 {
flow.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:4xf32>, %arg1: !flow.dispatch.tensor<readonly:4xf32>, %arg2: !flow.dispatch.tensor<writeonly:4xf32>) {
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%2 = linalg.init_tensor [4] : tensor<4xf32>
%3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%4 = arith.mulf %arg3, %arg4 : f32
linalg.yield %4 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %3, %arg2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = flow.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After OutlineConstants //----- //
#map = affine_map<(d0) -> (d0)>
module {
flow.executable private @simple_mul_dispatch_0 {
flow.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:4xf32>, %arg1: !flow.dispatch.tensor<readonly:4xf32>, %arg2: !flow.dispatch.tensor<writeonly:4xf32>) {
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%2 = linalg.init_tensor [4] : tensor<4xf32>
%3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%4 = arith.mulf %arg3, %arg4 : f32
linalg.yield %4 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %3, %arg2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = flow.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer //----- //
#map = affine_map<(d0) -> (d0)>
module {
flow.executable private @simple_mul_dispatch_0 {
flow.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:4xf32>, %arg1: !flow.dispatch.tensor<readonly:4xf32>, %arg2: !flow.dispatch.tensor<writeonly:4xf32>) {
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%2 = linalg.init_tensor [4] : tensor<4xf32>
%3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%4 = arith.mulf %arg3, %arg4 : f32
linalg.yield %4 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %3, %arg2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = flow.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After CSE //----- //
#map = affine_map<(d0) -> (d0)>
module {
flow.executable private @simple_mul_dispatch_0 {
flow.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:4xf32>, %arg1: !flow.dispatch.tensor<readonly:4xf32>, %arg2: !flow.dispatch.tensor<writeonly:4xf32>) {
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%2 = linalg.init_tensor [4] : tensor<4xf32>
%3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%4 = arith.mulf %arg3, %arg4 : f32
linalg.yield %4 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %3, %arg2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = flow.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = flow.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass //----- //
#map = affine_map<(d0) -> (d0)>
module {
flow.executable private @simple_mul_dispatch_0 {
flow.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:4xf32>, %arg1: !flow.dispatch.tensor<readonly:4xf32>, %arg2: !flow.dispatch.tensor<writeonly:4xf32>) {
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%2 = linalg.init_tensor [4] : tensor<4xf32>
%3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%4 = arith.mulf %arg3, %arg4 : f32
linalg.yield %4 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %3, %arg2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = flow.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass //----- //
#map = affine_map<(d0) -> (d0)>
module {
flow.executable private @simple_mul_dispatch_0 {
flow.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:4xf32>, %arg1: !flow.dispatch.tensor<readonly:4xf32>, %arg2: !flow.dispatch.tensor<writeonly:4xf32>) {
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%2 = linalg.init_tensor [4] : tensor<4xf32>
%3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%4 = arith.mulf %arg3, %arg4 : f32
linalg.yield %4 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %3, %arg2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = flow.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FuseGlobalsPass //----- //
#map = affine_map<(d0) -> (d0)>
module {
flow.executable private @simple_mul_dispatch_0 {
flow.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:4xf32>, %arg1: !flow.dispatch.tensor<readonly:4xf32>, %arg2: !flow.dispatch.tensor<writeonly:4xf32>) {
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%2 = linalg.init_tensor [4] : tensor<4xf32>
%3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%4 = arith.mulf %arg3, %arg4 : f32
linalg.yield %4 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %3, %arg2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
%2 = flow.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After ConvertToStream //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c4_0 = arith.constant 4 : index
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4_0]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.sizeof tensor<4xf32> : index
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%0}
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
%c553648160_i32_1 = arith.constant 553648160 : i32
%c1_i32_2 = arith.constant 1 : i32
%c4_3 = arith.constant 4 : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4_3]) type(%c553648160_i32_1) encoding(%c1_i32_2)
%3 = stream.tensor.sizeof tensor<4xf32> : index
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
%6 = stream.tensor.sizeof tensor<4xf32> : index
%7 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%2, %5) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}) -> !stream.resource<*>{%6}
%8 = stream.async.transfer %7 : !stream.resource<*>{%6} -> !stream.resource<external>{%6}
%9 = stream.tensor.export %8 : tensor<4xf32> in !stream.resource<external>{%6} -> !hal.buffer_view
return %9 : !hal.buffer_view
}
}
// -----// IR Dump After VerifyLoweringToTensors //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c4_0 = arith.constant 4 : index
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4_0]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.sizeof tensor<4xf32> : index
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%0}
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
%c553648160_i32_1 = arith.constant 553648160 : i32
%c1_i32_2 = arith.constant 1 : i32
%c4_3 = arith.constant 4 : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4_3]) type(%c553648160_i32_1) encoding(%c1_i32_2)
%3 = stream.tensor.sizeof tensor<4xf32> : index
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
%6 = stream.tensor.sizeof tensor<4xf32> : index
%7 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%2, %5) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}) -> !stream.resource<*>{%6}
%8 = stream.async.transfer %7 : !stream.resource<*>{%6} -> !stream.resource<external>{%6}
%9 = stream.tensor.export %8 : tensor<4xf32> in !stream.resource<external>{%6} -> !hal.buffer_view
return %9 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.sizeof tensor<4xf32> : index
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%0}
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%3 = stream.tensor.sizeof tensor<4xf32> : index
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
%6 = stream.tensor.sizeof tensor<4xf32> : index
%7 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%2, %5) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}) -> !stream.resource<*>{%6}
%8 = stream.async.transfer %7 : !stream.resource<*>{%6} -> !stream.resource<external>{%6}
%9 = stream.tensor.export %8 : tensor<4xf32> in !stream.resource<external>{%6} -> !hal.buffer_view
return %9 : !hal.buffer_view
}
}
// -----// IR Dump After CSE //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.sizeof tensor<4xf32> : index
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%0}
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%3 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%0}
%4 = stream.async.transfer %3 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
%5 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%2, %4) : (!stream.resource<*>{%0}, !stream.resource<*>{%0}) -> !stream.resource<*>{%0}
%6 = stream.async.transfer %5 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%7 = stream.tensor.export %6 : tensor<4xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
return %7 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.sizeof tensor<4xf32> : index
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%0}
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%3 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%0}
%4 = stream.async.transfer %3 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
%5 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%2, %4) : (!stream.resource<*>{%0}, !stream.resource<*>{%0}) -> !stream.resource<*>{%0}
%6 = stream.async.transfer %5 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%7 = stream.tensor.export %6 : tensor<4xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
return %7 : !hal.buffer_view
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.sizeof tensor<4xf32> : index
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%0}
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%3 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%0}
%4 = stream.async.transfer %3 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
%5 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%2, %4) : (!stream.resource<*>{%0}, !stream.resource<*>{%0}) -> !stream.resource<*>{%0}
%6 = stream.async.transfer %5 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%7 = stream.tensor.export %6 : tensor<4xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
return %7 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.sizeof tensor<4xf32> : index
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%0}
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%3 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%0}
%4 = stream.async.transfer %3 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
%5 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%2, %4) : (!stream.resource<*>{%0}, !stream.resource<*>{%0}) -> !stream.resource<*>{%0}
%6 = stream.async.transfer %5 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%7 = stream.tensor.export %6 : tensor<4xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
return %7 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FuseGlobalsPass //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.sizeof tensor<4xf32> : index
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%0}
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%3 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%0}
%4 = stream.async.transfer %3 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
%5 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%2, %4) : (!stream.resource<*>{%0}, !stream.resource<*>{%0}) -> !stream.resource<*>{%0}
%6 = stream.async.transfer %5 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%7 = stream.tensor.export %6 : tensor<4xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
return %7 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::CombineInitializersPass //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.sizeof tensor<4xf32> : index
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%0}
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%3 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%0}
%4 = stream.async.transfer %3 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
%5 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%2, %4) : (!stream.resource<*>{%0}, !stream.resource<*>{%0}) -> !stream.resource<*>{%0}
%6 = stream.async.transfer %5 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%7 = stream.tensor.export %6 : tensor<4xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
return %7 : !hal.buffer_view
}
}
// -----// IR Dump After EncodeDeviceTensors //----- //
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
// -----// IR Dump After EncodeHostTensors //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c16} -> !stream.resource<*>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c16} -> !stream.resource<*>{%c16}
%4 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%1, %3) : (!stream.resource<*>{%c16}, !stream.resource<*>{%c16}) -> !stream.resource<*>{%c16}
%5 = stream.async.transfer %4 : !stream.resource<*>{%c16} -> !stream.resource<external>{%c16}
%6 = stream.tensor.export %5 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
// -----// IR Dump After MaterializeBuiltins //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c16} -> !stream.resource<*>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c16} -> !stream.resource<*>{%c16}
%4 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%1, %3) : (!stream.resource<*>{%c16}, !stream.resource<*>{%c16}) -> !stream.resource<*>{%c16}
%5 = stream.async.transfer %4 : !stream.resource<*>{%c16} -> !stream.resource<external>{%c16}
%6 = stream.tensor.export %5 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c16} -> !stream.resource<*>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c16} -> !stream.resource<*>{%c16}
%4 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%1, %3) : (!stream.resource<*>{%c16}, !stream.resource<*>{%c16}) -> !stream.resource<*>{%c16}
%5 = stream.async.transfer %4 : !stream.resource<*>{%c16} -> !stream.resource<external>{%c16}
%6 = stream.tensor.export %5 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After CSE //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c16} -> !stream.resource<*>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c16} -> !stream.resource<*>{%c16}
%4 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%1, %3) : (!stream.resource<*>{%c16}, !stream.resource<*>{%c16}) -> !stream.resource<*>{%c16}
%5 = stream.async.transfer %4 : !stream.resource<*>{%c16} -> !stream.resource<external>{%c16}
%6 = stream.tensor.export %5 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c16} -> !stream.resource<*>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c16} -> !stream.resource<*>{%c16}
%4 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%1, %3) : (!stream.resource<*>{%c16}, !stream.resource<*>{%c16}) -> !stream.resource<*>{%c16}
%5 = stream.async.transfer %4 : !stream.resource<*>{%c16} -> !stream.resource<external>{%c16}
%6 = stream.tensor.export %5 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c16} -> !stream.resource<*>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c16} -> !stream.resource<*>{%c16}
%4 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%1, %3) : (!stream.resource<*>{%c16}, !stream.resource<*>{%c16}) -> !stream.resource<*>{%c16}
%5 = stream.async.transfer %4 : !stream.resource<*>{%c16} -> !stream.resource<external>{%c16}
%6 = stream.tensor.export %5 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c16} -> !stream.resource<*>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c16} -> !stream.resource<*>{%c16}
%4 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%1, %3) : (!stream.resource<*>{%c16}, !stream.resource<*>{%c16}) -> !stream.resource<*>{%c16}
%5 = stream.async.transfer %4 : !stream.resource<*>{%c16} -> !stream.resource<external>{%c16}
%6 = stream.tensor.export %5 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FuseGlobalsPass //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c16} -> !stream.resource<*>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c16} -> !stream.resource<*>{%c16}
%4 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%1, %3) : (!stream.resource<*>{%c16}, !stream.resource<*>{%c16}) -> !stream.resource<*>{%c16}
%5 = stream.async.transfer %4 : !stream.resource<*>{%c16} -> !stream.resource<external>{%c16}
%6 = stream.tensor.export %5 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After MaterializeCopyOnWrite //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c16} -> !stream.resource<*>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c16} -> !stream.resource<*>{%c16}
%4 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%1, %3) : (!stream.resource<*>{%c16}, !stream.resource<*>{%c16}) -> !stream.resource<*>{%c16}
%5 = stream.async.transfer %4 : !stream.resource<*>{%c16} -> !stream.resource<external>{%c16}
%6 = stream.tensor.export %5 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
// -----// IR Dump After ElideAsyncCopies //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c16} -> !stream.resource<*>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c16} -> !stream.resource<*>{%c16}
%4 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%1, %3) : (!stream.resource<*>{%c16}, !stream.resource<*>{%c16}) -> !stream.resource<*>{%c16}
%5 = stream.async.transfer %4 : !stream.resource<*>{%c16} -> !stream.resource<external>{%c16}
%6 = stream.tensor.export %5 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After RefineUsage //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%2 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%0, %1) : (!stream.resource<external>{%c16}, !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16}
%3 = stream.tensor.export %2 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After ScheduleExecution //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%results, %result_timepoint = stream.async.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16} {
%4 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%arg2, %arg3) : (!stream.resource<external>{%c16}, !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16}
stream.yield %4 : !stream.resource<external>{%c16}
} => !stream.timepoint
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c16}
%3 = stream.tensor.export %2 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After ScheduleConcurrency //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%results, %result_timepoint = stream.async.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16} {
%4 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%arg2, %arg3) : (!stream.resource<external>{%c16}, !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16}
stream.yield %4 : !stream.resource<external>{%c16}
} => !stream.timepoint
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c16}
%3 = stream.tensor.export %2 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After PropagateTimepoints //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%2 = stream.timepoint.immediate => !stream.timepoint
%3 = stream.timepoint.immediate => !stream.timepoint
%4 = stream.timepoint.immediate => !stream.timepoint
%results, %result_timepoint = stream.async.execute await(%4) => with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16} {
%7 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%arg2, %arg3) : (!stream.resource<external>{%c16}, !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16}
stream.yield %7 : !stream.resource<external>{%c16}
} => !stream.timepoint
%5 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c16}
%6 = stream.tensor.export %5 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%results, %result_timepoint = stream.async.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16} {
%4 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%arg2, %arg3) : (!stream.resource<external>{%c16}, !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16}
stream.yield %4 : !stream.resource<external>{%c16}
} => !stream.timepoint
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c16}
%3 = stream.tensor.export %2 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After CSE //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%results, %result_timepoint = stream.async.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16} {
%4 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%arg2, %arg3) : (!stream.resource<external>{%c16}, !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16}
stream.yield %4 : !stream.resource<external>{%c16}
} => !stream.timepoint
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c16}
%3 = stream.tensor.export %2 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%results, %result_timepoint = stream.async.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16} {
%4 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%arg2, %arg3) : (!stream.resource<external>{%c16}, !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16}
stream.yield %4 : !stream.resource<external>{%c16}
} => !stream.timepoint
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c16}
%3 = stream.tensor.export %2 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%results, %result_timepoint = stream.async.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16} {
%4 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%arg2, %arg3) : (!stream.resource<external>{%c16}, !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16}
stream.yield %4 : !stream.resource<external>{%c16}
} => !stream.timepoint
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c16}
%3 = stream.tensor.export %2 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%results, %result_timepoint = stream.async.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16} {
%4 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%arg2, %arg3) : (!stream.resource<external>{%c16}, !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16}
stream.yield %4 : !stream.resource<external>{%c16}
} => !stream.timepoint
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c16}
%3 = stream.tensor.export %2 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FuseGlobalsPass //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%results, %result_timepoint = stream.async.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16} {
%4 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%arg2, %arg3) : (!stream.resource<external>{%c16}, !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16}
stream.yield %4 : !stream.resource<external>{%c16}
} => !stream.timepoint
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c16}
%3 = stream.tensor.export %2 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After VerifyLoweringToAsync //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%results, %result_timepoint = stream.async.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16} {
%4 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%arg2, %arg3) : (!stream.resource<external>{%c16}, !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16}
stream.yield %4 : !stream.resource<external>{%c16}
} => !stream.timepoint
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c16}
%3 = stream.tensor.export %2 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After ScheduleAllocation //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%c0 = arith.constant 0 : index
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16}
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) {
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] {
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16},
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16},
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16}
}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16}
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %5 : !hal.buffer_view
}
// -----// IR Dump After PackConstants //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%c0 = arith.constant 0 : index
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16}
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) {
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] {
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16},
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16},
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16}
}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16}
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %5 : !hal.buffer_view
}
// -----// IR Dump After PackAllocations //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%c0 = arith.constant 0 : index
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16}
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) {
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] {
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16},
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16},
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16}
}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16}
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %5 : !hal.buffer_view
}
// -----// IR Dump After LayoutSlices //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%c0 = arith.constant 0 : index
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16}
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) {
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] {
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16},
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16},
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16}
}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16}
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %5 : !hal.buffer_view
}
// -----// IR Dump After PropagateSubviews //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%c0 = arith.constant 0 : index
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16}
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) {
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] {
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16},
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16},
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16}
}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16}
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16}
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) {
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] {
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16},
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16},
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16}
}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16}
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After CSE //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16}
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) {
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] {
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16},
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16},
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16}
}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16}
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16}
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) {
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] {
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16},
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16},
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16}
}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16}
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %5 : !hal.buffer_view
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16}
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) {
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] {
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16},
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16},
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16}
}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16}
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16}
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) {
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] {
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16},
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16},
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16}
}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16}
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FuseGlobalsPass //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16}
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) {
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] {
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16},
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16},
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16}
}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16}
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After VerifyLoweringToCmd //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16}
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) {
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] {
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16},
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16},
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16}
}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16}
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16}
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) {
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] {
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16},
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16},
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16}
}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16}
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After CSE //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16}
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) {
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] {
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16},
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16},
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16}
}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16}
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16}
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) {
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] {
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16},
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16},
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16}
}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16}
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %5 : !hal.buffer_view
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16}
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) {
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] {
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16},
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16},
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16}
}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16}
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16}
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) {
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] {
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16},
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16},
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16}
}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16}
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FuseGlobalsPass //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16}
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) {
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] {
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16},
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16},
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16}
}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16}
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After FuseDispatchBindings //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index) {
%c0 = arith.constant 0 : index
%0 = arith.addi %c0, %arg3 : index
%1 = stream.binding.subspan %arg0[%0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = arith.addi %c0, %arg4 : index
%3 = stream.binding.subspan %arg1[%2] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%4 = arith.addi %c0, %arg5 : index
%5 = stream.binding.subspan %arg2[%4] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%6 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%7 = flow.dispatch.tensor.load %3, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%8 = linalg.init_tensor [4] : tensor<4xf32>
%9 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%6, %7 : tensor<4xf32>, tensor<4xf32>) outs(%8 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg6: f32, %arg7: f32, %arg8: f32):
%10 = arith.mulf %arg6, %arg7 : f32
linalg.yield %10 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %9, %5, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16}
%c0_0 = arith.constant 0 : index
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) {
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%c0, %c0, %c0 : index, index, index) {
ro %arg2[%c0_0 for %c16] : !stream.resource<external>{%c16},
ro %arg3[%c0_0 for %c16] : !stream.resource<external>{%c16},
wo %arg4[%c0_0 for %c16] : !stream.resource<external>{%c16}
}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16}
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After PackDispatchOperands //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: i32, %arg4: i32, %arg5: i32) {
%0 = arith.index_cast %arg3 : i32 to index
%1 = arith.index_cast %arg4 : i32 to index
%2 = arith.index_cast %arg5 : i32 to index
%c0 = arith.constant 0 : index
%3 = arith.addi %c0, %0 : index
%4 = stream.binding.subspan %arg0[%3] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%5 = arith.addi %c0, %1 : index
%6 = stream.binding.subspan %arg1[%5] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%7 = arith.addi %c0, %2 : index
%8 = stream.binding.subspan %arg2[%7] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%9 = flow.dispatch.tensor.load %4, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%10 = flow.dispatch.tensor.load %6, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%11 = linalg.init_tensor [4] : tensor<4xf32>
%12 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%9, %10 : tensor<4xf32>, tensor<4xf32>) outs(%11 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg6: f32, %arg7: f32, %arg8: f32):
%13 = arith.mulf %arg6, %arg7 : f32
linalg.yield %13 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16}
%c0_0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%c0_i32_1 = arith.constant 0 : i32
%c0_i32_2 = arith.constant 0 : i32
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) {
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%c0_i32, %c0_i32_1, %c0_i32_2 : i32, i32, i32) {
ro %arg2[%c0_0 for %c16] : !stream.resource<external>{%c16},
ro %arg3[%c0_0 for %c16] : !stream.resource<external>{%c16},
wo %arg4[%c0_0 for %c16] : !stream.resource<external>{%c16}
}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16}
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After CSE //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: i32, %arg4: i32, %arg5: i32) {
%0 = arith.index_cast %arg3 : i32 to index
%1 = arith.index_cast %arg4 : i32 to index
%2 = arith.index_cast %arg5 : i32 to index
%c0 = arith.constant 0 : index
%3 = arith.addi %c0, %0 : index
%4 = stream.binding.subspan %arg0[%3] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%5 = arith.addi %c0, %1 : index
%6 = stream.binding.subspan %arg1[%5] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%7 = arith.addi %c0, %2 : index
%8 = stream.binding.subspan %arg2[%7] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%9 = flow.dispatch.tensor.load %4, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%10 = flow.dispatch.tensor.load %6, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%11 = linalg.init_tensor [4] : tensor<4xf32>
%12 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%9, %10 : tensor<4xf32>, tensor<4xf32>) outs(%11 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg6: f32, %arg7: f32, %arg8: f32):
%13 = arith.mulf %arg6, %arg7 : f32
linalg.yield %13 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16}
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) {
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%c0_i32, %c0_i32, %c0_i32 : i32, i32, i32) {
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16},
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16},
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16}
}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16}
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After FoldUniformOperands //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0_i32 = arith.constant 0 : i32
%0 = arith.index_cast %c0_i32 : i32 to index
%1 = arith.index_cast %c0_i32 : i32 to index
%2 = arith.index_cast %c0_i32 : i32 to index
%c0 = arith.constant 0 : index
%3 = arith.addi %c0, %0 : index
%4 = stream.binding.subspan %arg0[%3] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%5 = arith.addi %c0, %1 : index
%6 = stream.binding.subspan %arg1[%5] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%7 = arith.addi %c0, %2 : index
%8 = stream.binding.subspan %arg2[%7] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%9 = flow.dispatch.tensor.load %4, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%10 = flow.dispatch.tensor.load %6, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%11 = linalg.init_tensor [4] : tensor<4xf32>
%12 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%9, %10 : tensor<4xf32>, tensor<4xf32>) outs(%11 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%13 = arith.mulf %arg3, %arg4 : f32
linalg.yield %13 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16}
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) {
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] {
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16},
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16},
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16}
}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16}
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After AnnotateDispatchArguments //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0_i32 = arith.constant 0 : i32
%0 = arith.index_cast %c0_i32 : i32 to index
%1 = arith.index_cast %c0_i32 : i32 to index
%2 = arith.index_cast %c0_i32 : i32 to index
%c0 = arith.constant 0 : index
%3 = arith.addi %c0, %0 : index
%4 = stream.binding.subspan %arg0[%3] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%5 = arith.addi %c0, %1 : index
%6 = stream.binding.subspan %arg1[%5] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%7 = arith.addi %c0, %2 : index
%8 = stream.binding.subspan %arg2[%7] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%9 = flow.dispatch.tensor.load %4, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%10 = flow.dispatch.tensor.load %6, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%11 = linalg.init_tensor [4] : tensor<4xf32>
%12 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%9, %10 : tensor<4xf32>, tensor<4xf32>) outs(%11 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%13 = arith.mulf %arg3, %arg4 : f32
linalg.yield %13 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16}
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) {
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] {
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16},
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16},
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16}
}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16}
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16}
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) {
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] {
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16},
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16},
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16}
}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16}
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After CSE //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16}
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) {
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] {
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16},
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16},
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16}
}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16}
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16}
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) {
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] {
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16},
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16},
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16}
}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16}
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %5 : !hal.buffer_view
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16}
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) {
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] {
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16},
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16},
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16}
}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16}
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16}
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) {
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] {
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16},
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16},
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16}
}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16}
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FuseGlobalsPass //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16}
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) {
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] {
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16},
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16},
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16}
}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16}
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After SymbolDCE //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16}
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) {
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] {
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16},
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16},
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16}
}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16}
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16}
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) {
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] {
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16},
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16},
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16}
}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16}
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After CSE //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16}
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) {
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] {
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16},
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16},
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16}
}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16}
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- //
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16}
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) {
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] {
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16},
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16},
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16}
}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16}
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %5 : !hal.buffer_view
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16}
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) {
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] {
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16},
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16},
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16}
}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16}
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16}
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) {
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] {
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16},
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16},
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16}
}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16}
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FuseGlobalsPass //----- //
#map = affine_map<(d0) -> (d0)>
module {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16}
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) {
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] {
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16},
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16},
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16}
}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16}
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::AssignTargetDevicesPass //----- //
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}>
#map = affine_map<(d0) -> (d0)>
module attributes {hal.device.targets = [#device_target_cpu]} {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16}
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) {
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] {
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16},
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16},
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16}
}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16}
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::VerifyTargetEnvironmentPass //----- //
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}>
#map = affine_map<(d0) -> (d0)>
module attributes {hal.device.targets = [#device_target_cpu]} {
stream.executable private @simple_mul_dispatch_0 {
stream.executable.export public @simple_mul_dispatch_0
builtin.module {
func.func @simple_mul_dispatch_0(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%7 = arith.mulf %arg3, %arg4 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16}
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) {
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] {
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16},
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16},
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16}
}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16}
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::(anonymous namespace)::MaterializeInterfacesPass //----- //
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<(d0) -> (d0)>
module attributes {hal.device.targets = [#device_target_cpu]} {
hal.executable private @simple_mul_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout)
builtin.module {
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
%7 = arith.mulf %arg0, %arg1 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
}
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16}
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) {
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] {
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16},
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16},
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]}
} => !stream.timepoint
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16}
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After VerifyLinalgTransformLegality //----- //
module {
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
%7 = arith.mulf %arg0, %arg1 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
// -----// IR Dump After TypePropagation //----- //
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
%7 = arith.mulf %arg0, %arg1 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
// -----// IR Dump After BufferizeCopyOnlyDispatches //----- //
module {
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
%7 = arith.mulf %arg0, %arg1 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
// -----// IR Dump After InsertDistributionInfo //----- //
// Executable variant for the "llvm" / "embedded-elf-x86_64" target. It holds the
// export entry for @simple_mul_dispatch_0 and the module with the function body.
hal.executable.variant public @embedded_elf_x86_64, target = <"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> {
// The export carries translation_info (CPUDoubleTilingExpert, workload_per_wg = [4])
// and a layout with zero push constants and three storage_buffer bindings in set 0.
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]>) {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]>} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
// Returns (ceildiv(%arg1, 4), 1, 1); %arg2 and %arg3 are unused.
// NOTE(review): presumably the x/y/z workgroup counts derived from the workload - confirm.
%0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 4)>()[%arg1]
hal.return %0, %c1, %c1 : index, index, index
}
builtin.module {
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%5 = linalg.init_tensor [4] : tensor<4xf32>
// The generic op now carries a lowering_config attribute (tile_sizes = [[4], [4], [0]])
// in addition to its name; the computation itself is unchanged.
%6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[4], [4], [0]]>, name = "mul.1"} {
^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
%7 = arith.mulf %arg0, %arg1 : f32
linalg.yield %7 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
return
}
}
}
// -----// IR Dump After TileAndDistributeToWorkgroups //----- //
// The load / multiply / store sequence is now wrapped in an scf.for whose bounds
// are derived from the workgroup id and workgroup count along dimension 0.
func.func @simple_mul_dispatch_0() {
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
// %3 = workgroup_id_x * 4 is the loop start; %4 = workgroup_count_x * 4 is the loop step.
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x]
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x]
// Iterates %arg0 from %3 up to (but excluding) 4 in steps of %4.
scf.for %arg0 = %3 to %c4 step %4 {
// Each iteration loads 4 elements starting at offset %arg0; strides use the SSA constant %c1.
%5 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [%c1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%6 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [%c1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%7 = linalg.init_tensor [4] : tensor<4xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5, %6 : tensor<4xf32>, tensor<4xf32>) outs(%7 : tensor<4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[4], [4], [0]]>, name = "mul.1"} {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
%9 = arith.mulf %arg1, %arg2 : f32
linalg.yield %9 : f32
} -> tensor<4xf32>
// The result tile is stored back at the same offset %arg0.
flow.dispatch.tensor.store %8, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
}
return
}
// -----// IR Dump After ConvertToDestinationPassingStyle //----- //
// The linalg.generic's outs operand is now a tensor loaded from the output
// binding (%2) rather than a fresh linalg.init_tensor.
func.func @simple_mul_dispatch_0() {
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x]
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x]
scf.for %arg0 = %3 to %c4 step %4 {
// %c1_0 duplicates the function-level %c1; it is only used by the load on the next line.
%c1_0 = arith.constant 1 : index
// Load from the write-only-typed output binding; %5 becomes the outs operand below.
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [%c1_0] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32>
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [%c1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [%c1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
// %8 has no remaining uses in this function.
%8 = linalg.init_tensor [4] : tensor<4xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6, %7 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[4], [4], [0]]>, name = "mul.1"} {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
%10 = arith.mulf %arg1, %arg2 : f32
linalg.yield %10 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %9, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
}
return
}
// -----// IR Dump After FoldAffineMinInDistributedLoops //----- //
// This dump contains no affine.min ops. Relative to the preceding dump, the
// unused linalg.init_tensor and the duplicate constant %c1_0 are no longer
// present, and the output-binding load uses %c1 for its stride.
func.func @simple_mul_dispatch_0() {
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x]
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x]
scf.for %arg0 = %3 to %c4 step %4 {
// %5 (output tile), %6 and %7 (input tiles) are all loaded at offset %arg0.
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [%c1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32>
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [%c1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [%c1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6, %7 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[4], [4], [0]]>, name = "mul.1"} {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
%9 = arith.mulf %arg1, %arg2 : f32
linalg.yield %9 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %8, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
}
return
}
// -----// IR Dump After Canonicalizer //----- //
// Relative to the preceding dump, the three flow.dispatch.tensor.load ops now
// use the static stride [1]; the store still takes its stride from %c1.
module {
func.func @simple_mul_dispatch_0() {
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x]
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x]
scf.for %arg0 = %3 to %c4 step %4 {
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32>
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6, %7 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[4], [4], [0]]>, name = "mul.1"} {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
%9 = arith.mulf %arg1, %arg2 : f32
linalg.yield %9 : f32
} -> tensor<4xf32>
// This store is the only remaining use of %c1.
flow.dispatch.tensor.store %8, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
}
return
}
}
// -----// IR Dump After CSE //----- //
// The IR in this dump is textually identical to the preceding dump.
module {
func.func @simple_mul_dispatch_0() {
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x]
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x]
// Loop over 4-element tiles: start workgroup_id_x * 4, step workgroup_count_x * 4, bound 4.
scf.for %arg0 = %3 to %c4 step %4 {
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32>
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6, %7 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[4], [4], [0]]>, name = "mul.1"} {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
%9 = arith.mulf %arg1, %arg2 : f32
linalg.yield %9 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %8, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
}
return
}
}
// -----// IR Dump After LinalgStrategyTileAndFusePass //----- //
// An inner scf.for now wraps the linalg.generic. It threads the output tile
// through iter_args and works on tensor.extract_slice views of the operands.
func.func @simple_mul_dispatch_0() {
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x]
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x]
scf.for %arg0 = %3 to %c4 step %4 {
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32>
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
// Bounds 0 to 4 with step 4: the body executes exactly once, with %arg1 = 0.
%8 = scf.for %arg1 = %c0 to %c4 step %c4 iter_args(%arg2 = %5) -> (tensor<4xf32>) {
// Each slice has size 4 and stride 1 of a 4-element tensor, i.e. the same static shape as its source.
%9 = tensor.extract_slice %6[%arg1] [4] [1] : tensor<4xf32> to tensor<4xf32>
%10 = tensor.extract_slice %7[%arg1] [4] [1] : tensor<4xf32> to tensor<4xf32>
%11 = tensor.extract_slice %arg2[%arg1] [4] [1] : tensor<4xf32> to tensor<4xf32>
// The generic op gains the attribute __internal_linalg_transform__ = "1".
%12 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%9, %10 : tensor<4xf32>, tensor<4xf32>) outs(%11 : tensor<4xf32>) attrs = {__internal_linalg_transform__ = "1", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[4], [4], [0]]>, name = "mul.1"} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%14 = arith.mulf %arg3, %arg4 : f32
linalg.yield %14 : f32
} -> tensor<4xf32>
// Insert the computed tile back into the loop-carried tensor and yield it.
%13 = tensor.insert_slice %12 into %arg2[%arg1] [4] [1] : tensor<4xf32> into tensor<4xf32>
scf.yield %13 : tensor<4xf32>
}
flow.dispatch.tensor.store %8, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
}
return
}
// -----// IR Dump After CSE //----- //
// Relative to the preceding dump, the inner scf.for and the
// tensor.extract_slice / tensor.insert_slice ops are absent; the generic op
// again consumes %6, %7 and %5 directly.
// NOTE(review): presumably removed by a cleanup step that is not printed
// separately in this log - confirm.
func.func @simple_mul_dispatch_0() {
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x]
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x]
scf.for %arg0 = %3 to %c4 step %4 {
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32>
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
// The __internal_linalg_transform__ = "1" attribute is still attached.
%8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6, %7 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {__internal_linalg_transform__ = "1", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[4], [4], [0]]>, name = "mul.1"} {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
%9 = arith.mulf %arg1, %arg2 : f32
linalg.yield %9 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %8, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
}
return
}
// -----// IR Dump After LinalgStrategyEnablePass //----- //
// The IR in this dump is textually identical to the preceding dump.
func.func @simple_mul_dispatch_0() {
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x]
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x]
scf.for %arg0 = %3 to %c4 step %4 {
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32>
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
// Element-wise multiply of the two input tiles, still tagged __internal_linalg_transform__ = "1".
%8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6, %7 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {__internal_linalg_transform__ = "1", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[4], [4], [0]]>, name = "mul.1"} {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
%9 = arith.mulf %arg1, %arg2 : f32
linalg.yield %9 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %8, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
}
return
}
// -----// IR Dump After LinalgStrategyRemoveMarkersPass //----- //
// Relative to the preceding dump, the __internal_linalg_transform__ attribute
// is no longer present on the linalg.generic; everything else is unchanged.
func.func @simple_mul_dispatch_0() {
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x]
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x]
scf.for %arg0 = %3 to %c4 step %4 {
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32>
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
// Only lowering_config and name remain in attrs.
%8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6, %7 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[4], [4], [0]]>, name = "mul.1"} {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
%9 = arith.mulf %arg1, %arg2 : f32
linalg.yield %9 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %8, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
}
return
}
// -----// IR Dump After LinalgFuse //----- //
// The IR in this dump is textually identical to the preceding dump.
func.func @simple_mul_dispatch_0() {
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x]
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x]
scf.for %arg0 = %3 to %c4 step %4 {
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32>
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
// Single linalg.generic in the loop body: out[i] = in0[i] * in1[i].
%8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6, %7 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[4], [4], [0]]>, name = "mul.1"} {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
%9 = arith.mulf %arg1, %arg2 : f32
linalg.yield %9 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %8, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
}
return
}
// -----// IR Dump After Canonicalizer //----- //
// The IR in this dump is textually identical to the preceding dump.
func.func @simple_mul_dispatch_0() {
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x]
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x]
scf.for %arg0 = %3 to %c4 step %4 {
// Loads use the static stride [1]; the store below still uses %c1.
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32>
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6, %7 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[4], [4], [0]]>, name = "mul.1"} {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
%9 = arith.mulf %arg1, %arg2 : f32
linalg.yield %9 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %8, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
}
return
}
// -----// IR Dump After CSE //----- //
// The IR in this dump is textually identical to the preceding dump.
func.func @simple_mul_dispatch_0() {
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
// Loop start and step are the workgroup id and count along dimension 0, each scaled by 4.
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x]
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x]
scf.for %arg0 = %3 to %c4 step %4 {
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32>
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6, %7 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[4], [4], [0]]>, name = "mul.1"} {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
%9 = arith.mulf %arg1, %arg2 : f32
linalg.yield %9 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %8, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
}
return
}
// -----// IR Dump After LinalgStrategyTilePass //----- //
// Relative to the preceding dump, the linalg.generic carries the attribute
// __internal_linalg_transform__ = "1" again; no new loops or slice ops appear.
func.func @simple_mul_dispatch_0() {
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x]
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x]
scf.for %arg0 = %3 to %c4 step %4 {
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32>
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
// The operands are still the full 4-element tiles loaded above.
%8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6, %7 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {__internal_linalg_transform__ = "1", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[4], [4], [0]]>, name = "mul.1"} {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
%9 = arith.mulf %arg1, %arg2 : f32
linalg.yield %9 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %8, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
}
return
}
// -----// IR Dump After CSE //----- //
// The IR in this dump is textually identical to the preceding dump.
func.func @simple_mul_dispatch_0() {
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x]
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x]
scf.for %arg0 = %3 to %c4 step %4 {
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32>
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
// The generic op is still tagged __internal_linalg_transform__ = "1".
%8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6, %7 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {__internal_linalg_transform__ = "1", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[4], [4], [0]]>, name = "mul.1"} {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
%9 = arith.mulf %arg1, %arg2 : f32
linalg.yield %9 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %8, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
}
return
}
// -----// IR Dump After LinalgStrategyEnablePass //----- //
// The IR in this dump is textually identical to the preceding dump.
func.func @simple_mul_dispatch_0() {
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x]
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x]
scf.for %arg0 = %3 to %c4 step %4 {
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32>
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
// The element-wise multiply is still expressed as a scalar-region linalg.generic.
%8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6, %7 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {__internal_linalg_transform__ = "1", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[4], [4], [0]]>, name = "mul.1"} {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
%9 = arith.mulf %arg1, %arg2 : f32
linalg.yield %9 : f32
} -> tensor<4xf32>
flow.dispatch.tensor.store %8, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
}
return
}
// -----// IR Dump After LinalgStrategyVectorizePass //----- //
func.func @simple_mul_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x]
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x]
scf.for %arg0 = %3 to %c4 step %4 {
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32>
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%8 = vector.transfer_read %6[%c0], %cst {in_bounds = [true]} : tensor<4xf32>, vector<4xf32>
%9 = vector.transfer_read %7[%c0], %cst {in_bounds = [true]} : tensor<4xf32>, vector<4xf32>
%10 = arith.mulf %8, %9 : vector<4xf32>
%11 = vector.transfer_write %10, %5[%c0] {in_bounds = [true]} : vector<4xf32>, tensor<4xf32>
flow.dispatch.tensor.store %11, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
}
return
}
// -----// IR Dump After CSE //----- //
func.func @simple_mul_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x]
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x]
scf.for %arg0 = %3 to %c4 step %4 {
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32>
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%8 = vector.transfer_read %6[%c0], %cst {in_bounds = [true]} : tensor<4xf32>, vector<4xf32>
%9 = vector.transfer_read %7[%c0], %cst {in_bounds = [true]} : tensor<4xf32>, vector<4xf32>
%10 = arith.mulf %8, %9 : vector<4xf32>
%11 = vector.transfer_write %10, %5[%c0] {in_bounds = [true]} : vector<4xf32>, tensor<4xf32>
flow.dispatch.tensor.store %11, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
}
return
}
// -----// IR Dump After LinalgStrategyEnablePass //----- //
func.func @simple_mul_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x]
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x]
scf.for %arg0 = %3 to %c4 step %4 {
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32>
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%8 = vector.transfer_read %6[%c0], %cst {in_bounds = [true]} : tensor<4xf32>, vector<4xf32>
%9 = vector.transfer_read %7[%c0], %cst {in_bounds = [true]} : tensor<4xf32>, vector<4xf32>
%10 = arith.mulf %8, %9 : vector<4xf32>
%11 = vector.transfer_write %10, %5[%c0] {in_bounds = [true]} : vector<4xf32>, tensor<4xf32>
flow.dispatch.tensor.store %11, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
}
return
}
// -----// IR Dump After LinalgStrategyRemoveMarkersPass //----- //
func.func @simple_mul_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x]
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x]
scf.for %arg0 = %3 to %c4 step %4 {
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32>
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%8 = vector.transfer_read %6[%c0], %cst {in_bounds = [true]} : tensor<4xf32>, vector<4xf32>
%9 = vector.transfer_read %7[%c0], %cst {in_bounds = [true]} : tensor<4xf32>, vector<4xf32>
%10 = arith.mulf %8, %9 : vector<4xf32>
%11 = vector.transfer_write %10, %5[%c0] {in_bounds = [true]} : vector<4xf32>, tensor<4xf32>
flow.dispatch.tensor.store %11, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
}
return
}
// -----// IR Dump After LinalgSingleTilingExpert //----- //
func.func @simple_mul_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x]
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x]
scf.for %arg0 = %3 to %c4 step %4 {
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32>
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%8 = vector.transfer_read %6[%c0], %cst {in_bounds = [true]} : tensor<4xf32>, vector<4xf32>
%9 = vector.transfer_read %7[%c0], %cst {in_bounds = [true]} : tensor<4xf32>, vector<4xf32>
%10 = arith.mulf %8, %9 : vector<4xf32>
%11 = vector.transfer_write %10, %5[%c0] {in_bounds = [true]} : vector<4xf32>, tensor<4xf32>
flow.dispatch.tensor.store %11, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
}
return
}
// -----// IR Dump After Canonicalizer //----- //
func.func @simple_mul_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x]
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x]
scf.for %arg0 = %3 to %c4 step %4 {
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32>
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%8 = vector.transfer_read %6[%c0], %cst {in_bounds = [true]} : tensor<4xf32>, vector<4xf32>
%9 = vector.transfer_read %7[%c0], %cst {in_bounds = [true]} : tensor<4xf32>, vector<4xf32>
%10 = arith.mulf %8, %9 : vector<4xf32>
%11 = vector.transfer_write %10, %5[%c0] {in_bounds = [true]} : vector<4xf32>, tensor<4xf32>
flow.dispatch.tensor.store %11, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
}
return
}
// -----// IR Dump After CSE //----- //
func.func @simple_mul_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x]
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x]
scf.for %arg0 = %3 to %c4 step %4 {
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32>
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%8 = vector.transfer_read %6[%c0], %cst {in_bounds = [true]} : tensor<4xf32>, vector<4xf32>
%9 = vector.transfer_read %7[%c0], %cst {in_bounds = [true]} : tensor<4xf32>, vector<4xf32>
%10 = arith.mulf %8, %9 : vector<4xf32>
%11 = vector.transfer_write %10, %5[%c0] {in_bounds = [true]} : vector<4xf32>, tensor<4xf32>
flow.dispatch.tensor.store %11, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
}
return
}
// -----// IR Dump After LinalgInitTensorToAllocTensor //----- //
module {
func.func @simple_mul_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x]
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x]
scf.for %arg0 = %3 to %c4 step %4 {
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32>
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
%8 = vector.transfer_read %6[%c0], %cst {in_bounds = [true]} : tensor<4xf32>, vector<4xf32>
%9 = vector.transfer_read %7[%c0], %cst {in_bounds = [true]} : tensor<4xf32>, vector<4xf32>
%10 = arith.mulf %8, %9 : vector<4xf32>
%11 = vector.transfer_write %10, %5[%c0] {in_bounds = [true]} : vector<4xf32>, tensor<4xf32>
flow.dispatch.tensor.store %11, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
}
return
}
}
// -----// IR Dump After IREEComprehensiveBufferize //----- //
module {
func.func @simple_mul_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%4 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %4, 64 : memref<4xf32>
%5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%6 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x]
%7 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x]
scf.for %arg0 = %6 to %c4 step %7 {
%8 = bufferization.to_tensor %4 : memref<4xf32>
%9 = bufferization.to_tensor %0 : memref<4xf32>
%10 = bufferization.to_tensor %2 : memref<4xf32>
%11 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%12 = vector.transfer_read %2[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%13 = arith.mulf %11, %12 : vector<4xf32>
vector.transfer_write %13, %4[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32>
%14 = bufferization.to_tensor %4 : memref<4xf32>
linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%4 : memref<4xf32>) outs(%4 : memref<4xf32>) {
^bb0(%arg1: f32, %arg2: f32):
linalg.yield %arg1 : f32
}
}
return
}
}
// -----// IR Dump After ResolveShapedTypeResultDims //----- //
module {
func.func @simple_mul_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%4 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %4, 64 : memref<4xf32>
%5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%6 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x]
%7 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x]
scf.for %arg0 = %6 to %c4 step %7 {
%8 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%9 = vector.transfer_read %2[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%10 = arith.mulf %8, %9 : vector<4xf32>
vector.transfer_write %10, %4[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32>
linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%4 : memref<4xf32>) outs(%4 : memref<4xf32>) {
^bb0(%arg1: f32, %arg2: f32):
linalg.yield %arg1 : f32
}
}
return
}
}
// -----// IR Dump After Canonicalizer //----- //
func.func @simple_mul_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%4 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %4, 64 : memref<4xf32>
%5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%6 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x]
%7 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x]
scf.for %arg0 = %6 to %c4 step %7 {
%8 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%9 = vector.transfer_read %2[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%10 = arith.mulf %8, %9 : vector<4xf32>
vector.transfer_write %10, %4[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32>
}
return
}
// -----// IR Dump After CSE //----- //
func.func @simple_mul_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%4 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %4, 64 : memref<4xf32>
%5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%6 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x]
%7 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x]
scf.for %arg0 = %6 to %c4 step %7 {
%8 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%9 = vector.transfer_read %2[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%10 = arith.mulf %8, %9 : vector<4xf32>
vector.transfer_write %10, %4[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32>
}
return
}
// -----// IR Dump After Canonicalizer //----- //
func.func @simple_mul_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32>
%4 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %4, 64 : memref<4xf32>
%5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%6 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x]
%7 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x]
scf.for %arg0 = %6 to %c4 step %7 {
%8 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%9 = vector.transfer_read %2[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%10 = arith.mulf %8, %9 : vector<4xf32>
vector.transfer_write %10, %4[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32>
}
return
}
// -----// IR Dump After CleanupBufferAllocView //----- //
func.func @simple_mul_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x]
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x]
scf.for %arg0 = %3 to %c4 step %4 {
%5 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%6 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%7 = arith.mulf %5, %6 : vector<4xf32>
vector.transfer_write %7, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32>
}
return
}
// -----// IR Dump After RemoveSingleIterationLoop //----- //
func.func @simple_mul_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32>
return
}
// -----// IR Dump After LinalgStrategyLowerVectorsPass //----- //
func.func @simple_mul_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32>
return
}
// -----// IR Dump After CSE //----- //
func.func @simple_mul_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32>
return
}
// -----// IR Dump After LinalgStrategyEnablePass //----- //
func.func @simple_mul_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32>
return
}
// -----// IR Dump After LinalgStrategyRemoveMarkersPass //----- //
func.func @simple_mul_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32>
return
}
// -----// IR Dump After LinalgVectorLowering //----- //
func.func @simple_mul_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32>
return
}
// -----// IR Dump After Canonicalizer //----- //
func.func @simple_mul_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32>
return
}
// -----// IR Dump After CSE //----- //
func.func @simple_mul_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32>
return
}
// -----// IR Dump After LinalgStrategyLowerVectorsPass //----- //
func.func @simple_mul_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32>
return
}
// -----// IR Dump After CSE //----- //
func.func @simple_mul_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32>
return
}
// -----// IR Dump After LinalgStrategyEnablePass //----- //
func.func @simple_mul_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32>
return
}
// -----// IR Dump After LinalgStrategyRemoveMarkersPass //----- //
func.func @simple_mul_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32>
return
}
// -----// IR Dump After LinalgVectorLowering //----- //
func.func @simple_mul_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32>
return
}
// -----// IR Dump After Canonicalizer //----- //
func.func @simple_mul_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32>
return
}
// -----// IR Dump After CSE //----- //
func.func @simple_mul_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32>
return
}
// -----// IR Dump After LinalgStrategyLowerVectorsPass //----- //
func.func @simple_mul_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32>
return
}
// -----// IR Dump After CSE //----- //
func.func @simple_mul_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32>
return
}
// -----// IR Dump After LinalgStrategyEnablePass //----- //
func.func @simple_mul_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32>
return
}
// -----// IR Dump After LinalgStrategyRemoveMarkersPass //----- //
func.func @simple_mul_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32>
return
}
// -----// IR Dump After LinalgVectorLowering //----- //
func.func @simple_mul_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32>
return
}
// -----// IR Dump After Canonicalizer //----- //
func.func @simple_mul_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32>
return
}
// -----// IR Dump After CSE //----- //
func.func @simple_mul_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32>
return
}
// -----// IR Dump After LinalgStrategyLowerVectorsPass //----- //
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
// -----// IR Dump After CSE //----- //
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
// -----// IR Dump After LinalgStrategyEnablePass //----- //
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
// -----// IR Dump After LinalgStrategyRemoveMarkersPass //----- //
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
// -----// IR Dump After LinalgVectorLowering //----- //
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
// -----// IR Dump After Canonicalizer //----- //
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
// -----// IR Dump After CSE //----- //
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
// -----// IR Dump After LinalgStrategyLowerVectorsPass //----- //
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
// -----// IR Dump After CSE //----- //
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
// -----// IR Dump After LinalgStrategyEnablePass //----- //
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
// -----// IR Dump After LinalgStrategyRemoveMarkersPass //----- //
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
// -----// IR Dump After LinalgVectorLowering //----- //
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
// -----// IR Dump After Canonicalizer //----- //
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
// -----// IR Dump After CSE //----- //
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
// -----// IR Dump After LinalgStrategyLowerVectorsPass //----- //
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
// -----// IR Dump After CSE //----- //
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
// -----// IR Dump After LinalgStrategyEnablePass //----- //
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
// -----// IR Dump After LinalgStrategyRemoveMarkersPass //----- //
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
// -----// IR Dump After LinalgVectorLowering //----- //
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
// -----// IR Dump After Canonicalizer //----- //
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
// -----// IR Dump After CSE //----- //
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
// -----// IR Dump After LinalgStrategyLowerVectorsPass //----- //
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
// -----// IR Dump After CSE //----- //
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
// -----// IR Dump After LinalgStrategyEnablePass //----- //
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
// -----// IR Dump After LinalgStrategyRemoveMarkersPass //----- //
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
// -----// IR Dump After LinalgVectorLowering //----- //
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
// -----// IR Dump After Canonicalizer //----- //
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
// -----// IR Dump After CSE //----- //
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
// -----// IR Dump After LLVMCPULowerExecutableTarget //----- //
hal.executable.variant public @embedded_elf_x86_64, target = <"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> {
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]>) {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]>} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 4)>()[%arg1]
hal.return %0, %c1, %c1 : index, index, index
}
builtin.module {
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
}
}
// -----// IR Dump After LinalgExtToLoops //----- //
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
// -----// IR Dump After MemrefCopyToLinalgPass //----- //
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
// -----// IR Dump After LinalgLowerToLoops //----- //
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
// -----// IR Dump After Canonicalizer //----- //
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
// -----// IR Dump After CSE //----- //
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
// -----// IR Dump After ArithmeticBufferize //----- //
module {
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
}
// -----// IR Dump After FoldTensorExtractOp //----- //
module {
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
}
// -----// IR Dump After PolynomialApproximationPass //----- //
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
// -----// IR Dump After LLVMCPUCheckIRBeforeLLVMConversion //----- //
module {
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
}
// -----// IR Dump After SCFToControlFlow //----- //
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
// -----// IR Dump After Canonicalizer //----- //
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
// -----// IR Dump After CSE //----- //
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
// -----// IR Dump After ArithmeticExpandOps //----- //
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
// -----// IR Dump After ExpandOps //----- //
func.func @simple_mul_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32>
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32>
%5 = arith.mulf %3, %4 : vector<4xf32>
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32>
return
}
// -----// IR Dump After ConvertToLLVM //----- //
module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 {
%0 = llvm.mlir.constant(0 : index) : i64
%1 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%2 = llvm.extractvalue %1[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%3 = llvm.mlir.constant(0 : i64) : i64
%4 = llvm.load %2 : !llvm.ptr<ptr<i8>>
%5 = llvm.bitcast %4 : !llvm.ptr<i8> to !llvm.ptr<f32>
%6 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%7 = llvm.insertvalue %5, %6[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%8 = llvm.insertvalue %5, %7[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%9 = llvm.mlir.constant(0 : index) : i64
%10 = llvm.insertvalue %9, %8[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%11 = llvm.mlir.constant(4 : index) : i64
%12 = llvm.insertvalue %11, %10[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%13 = llvm.mlir.constant(1 : index) : i64
%14 = llvm.insertvalue %13, %12[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%15 = llvm.extractvalue %14[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%16 = llvm.mlir.constant(0 : index) : i64
%17 = llvm.mlir.constant(63 : index) : i64
%18 = llvm.ptrtoint %15 : !llvm.ptr<f32> to i64
%19 = llvm.and %18, %17 : i64
%20 = llvm.icmp "eq" %19, %16 : i64
"llvm.intr.assume"(%20) : (i1) -> ()
%21 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%22 = llvm.extractvalue %21[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%23 = llvm.mlir.constant(1 : i64) : i64
%24 = llvm.getelementptr %22[%23] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%25 = llvm.load %24 : !llvm.ptr<ptr<i8>>
%26 = llvm.bitcast %25 : !llvm.ptr<i8> to !llvm.ptr<f32>
%27 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%28 = llvm.insertvalue %26, %27[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%29 = llvm.insertvalue %26, %28[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%30 = llvm.mlir.constant(0 : index) : i64
%31 = llvm.insertvalue %30, %29[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%32 = llvm.mlir.constant(4 : index) : i64
%33 = llvm.insertvalue %32, %31[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%34 = llvm.mlir.constant(1 : index) : i64
%35 = llvm.insertvalue %34, %33[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%36 = llvm.extractvalue %35[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%37 = llvm.mlir.constant(0 : index) : i64
%38 = llvm.mlir.constant(63 : index) : i64
%39 = llvm.ptrtoint %36 : !llvm.ptr<f32> to i64
%40 = llvm.and %39, %38 : i64
%41 = llvm.icmp "eq" %40, %37 : i64
"llvm.intr.assume"(%41) : (i1) -> ()
%42 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%43 = llvm.extractvalue %42[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%44 = llvm.mlir.constant(2 : i64) : i64
%45 = llvm.getelementptr %43[%44] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%46 = llvm.load %45 : !llvm.ptr<ptr<i8>>
%47 = llvm.bitcast %46 : !llvm.ptr<i8> to !llvm.ptr<f32>
%48 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%49 = llvm.insertvalue %47, %48[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%50 = llvm.insertvalue %47, %49[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%51 = llvm.mlir.constant(0 : index) : i64
%52 = llvm.insertvalue %51, %50[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%53 = llvm.mlir.constant(4 : index) : i64
%54 = llvm.insertvalue %53, %52[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%55 = llvm.mlir.constant(1 : index) : i64
%56 = llvm.insertvalue %55, %54[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%57 = llvm.extractvalue %56[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%58 = llvm.mlir.constant(0 : index) : i64
%59 = llvm.mlir.constant(63 : index) : i64
%60 = llvm.ptrtoint %57 : !llvm.ptr<f32> to i64
%61 = llvm.and %60, %59 : i64
%62 = llvm.icmp "eq" %61, %58 : i64
"llvm.intr.assume"(%62) : (i1) -> ()
%63 = llvm.extractvalue %14[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%64 = llvm.getelementptr %63[%0] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
%65 = llvm.bitcast %64 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%66 = llvm.load %65 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%67 = llvm.extractvalue %35[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%68 = llvm.getelementptr %67[%0] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
%69 = llvm.bitcast %68 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%70 = llvm.load %69 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%71 = llvm.fmul %66, %70 : vector<4xf32>
%72 = llvm.extractvalue %56[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%73 = llvm.getelementptr %72[%0] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
%74 = llvm.bitcast %73 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
llvm.store %71, %74 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%75 = llvm.mlir.constant(0 : i32) : i32
llvm.return %75 : i32
}
}
// -----// IR Dump After ReconcileUnrealizedCasts //----- //
module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 {
%0 = llvm.mlir.constant(0 : index) : i64
%1 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%2 = llvm.extractvalue %1[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%3 = llvm.mlir.constant(0 : i64) : i64
%4 = llvm.load %2 : !llvm.ptr<ptr<i8>>
%5 = llvm.bitcast %4 : !llvm.ptr<i8> to !llvm.ptr<f32>
%6 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%7 = llvm.insertvalue %5, %6[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%8 = llvm.insertvalue %5, %7[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%9 = llvm.mlir.constant(0 : index) : i64
%10 = llvm.insertvalue %9, %8[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%11 = llvm.mlir.constant(4 : index) : i64
%12 = llvm.insertvalue %11, %10[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%13 = llvm.mlir.constant(1 : index) : i64
%14 = llvm.insertvalue %13, %12[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%15 = llvm.mlir.constant(0 : index) : i64
%16 = llvm.mlir.constant(63 : index) : i64
%17 = llvm.ptrtoint %5 : !llvm.ptr<f32> to i64
%18 = llvm.and %17, %16 : i64
%19 = llvm.icmp "eq" %18, %15 : i64
"llvm.intr.assume"(%19) : (i1) -> ()
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%22 = llvm.mlir.constant(1 : i64) : i64
%23 = llvm.getelementptr %21[%22] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%24 = llvm.load %23 : !llvm.ptr<ptr<i8>>
%25 = llvm.bitcast %24 : !llvm.ptr<i8> to !llvm.ptr<f32>
%26 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%27 = llvm.insertvalue %25, %26[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%28 = llvm.insertvalue %25, %27[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%29 = llvm.mlir.constant(0 : index) : i64
%30 = llvm.insertvalue %29, %28[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%31 = llvm.mlir.constant(4 : index) : i64
%32 = llvm.insertvalue %31, %30[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%33 = llvm.mlir.constant(1 : index) : i64
%34 = llvm.insertvalue %33, %32[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%35 = llvm.mlir.constant(0 : index) : i64
%36 = llvm.mlir.constant(63 : index) : i64
%37 = llvm.ptrtoint %25 : !llvm.ptr<f32> to i64
%38 = llvm.and %37, %36 : i64
%39 = llvm.icmp "eq" %38, %35 : i64
"llvm.intr.assume"(%39) : (i1) -> ()
%40 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%41 = llvm.extractvalue %40[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%42 = llvm.mlir.constant(2 : i64) : i64
%43 = llvm.getelementptr %41[%42] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%44 = llvm.load %43 : !llvm.ptr<ptr<i8>>
%45 = llvm.bitcast %44 : !llvm.ptr<i8> to !llvm.ptr<f32>
%46 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%47 = llvm.insertvalue %45, %46[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%48 = llvm.insertvalue %45, %47[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%49 = llvm.mlir.constant(0 : index) : i64
%50 = llvm.insertvalue %49, %48[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%51 = llvm.mlir.constant(4 : index) : i64
%52 = llvm.insertvalue %51, %50[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%53 = llvm.mlir.constant(1 : index) : i64
%54 = llvm.insertvalue %53, %52[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%55 = llvm.mlir.constant(0 : index) : i64
%56 = llvm.mlir.constant(63 : index) : i64
%57 = llvm.ptrtoint %45 : !llvm.ptr<f32> to i64
%58 = llvm.and %57, %56 : i64
%59 = llvm.icmp "eq" %58, %55 : i64
"llvm.intr.assume"(%59) : (i1) -> ()
%60 = llvm.bitcast %5 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%61 = llvm.load %60 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%62 = llvm.bitcast %25 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%63 = llvm.load %62 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%64 = llvm.fmul %61, %63 : vector<4xf32>
%65 = llvm.bitcast %45 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
llvm.store %64, %65 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%66 = llvm.mlir.constant(0 : i32) : i32
llvm.return %66 : i32
}
}
// -----// IR Dump After LLVMCPUSynchronizeSymbolVisibility //----- //
// Executable module as printed after LLVMCPUSynchronizeSymbolVisibility: one
// LLVM-dialect function, annotated with the data layout and target triple below.
module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Dispatch entry point. Receives pointers to the environment (%arg0), dispatch
// state (%arg1) and workgroup state (%arg2) structs and returns an i32, which is
// the constant 0 on the only path through the body.
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : index) : i64
// Pointer #0: load the dispatch state, take field 10 (a ptr<ptr<i8>> table) and
// read its first entry directly, without a GEP. %0 and %3 have no later uses.
%1 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%2 = llvm.extractvalue %1[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%3 = llvm.mlir.constant(0 : i64) : i64
%4 = llvm.load %2 : !llvm.ptr<ptr<i8>>
%5 = llvm.bitcast %4 : !llvm.ptr<i8> to !llvm.ptr<f32>
// Pack %5 into a (ptr, ptr, i64, [1 x i64], [1 x i64]) struct holding the pointer
// twice, then 0, 4 and 1 (presumably a rank-1 memref descriptor: offset, size,
// stride; confirm). The final value %14 is not read by any later op.
%6 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%7 = llvm.insertvalue %5, %6[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%8 = llvm.insertvalue %5, %7[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%9 = llvm.mlir.constant(0 : index) : i64
%10 = llvm.insertvalue %9, %8[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%11 = llvm.mlir.constant(4 : index) : i64
%12 = llvm.insertvalue %11, %10[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%13 = llvm.mlir.constant(1 : index) : i64
%14 = llvm.insertvalue %13, %12[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
// Alignment hint: assume (address of %5) & 63 == 0, i.e. a multiple of 64.
%15 = llvm.mlir.constant(0 : index) : i64
%16 = llvm.mlir.constant(63 : index) : i64
%17 = llvm.ptrtoint %5 : !llvm.ptr<f32> to i64
%18 = llvm.and %17, %16 : i64
%19 = llvm.icmp "eq" %18, %15 : i64
"llvm.intr.assume"(%19) : (i1) -> ()
// Pointer #1: same sequence, but the table entry is selected with a GEP at index 1.
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%22 = llvm.mlir.constant(1 : i64) : i64
%23 = llvm.getelementptr %21[%22] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%24 = llvm.load %23 : !llvm.ptr<ptr<i8>>
%25 = llvm.bitcast %24 : !llvm.ptr<i8> to !llvm.ptr<f32>
// Struct for %25, built the same way; its final value %34 is also unused below.
%26 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%27 = llvm.insertvalue %25, %26[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%28 = llvm.insertvalue %25, %27[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%29 = llvm.mlir.constant(0 : index) : i64
%30 = llvm.insertvalue %29, %28[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%31 = llvm.mlir.constant(4 : index) : i64
%32 = llvm.insertvalue %31, %30[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%33 = llvm.mlir.constant(1 : index) : i64
%34 = llvm.insertvalue %33, %32[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
// Same 64-byte alignment assumption for %25.
%35 = llvm.mlir.constant(0 : index) : i64
%36 = llvm.mlir.constant(63 : index) : i64
%37 = llvm.ptrtoint %25 : !llvm.ptr<f32> to i64
%38 = llvm.and %37, %36 : i64
%39 = llvm.icmp "eq" %38, %35 : i64
"llvm.intr.assume"(%39) : (i1) -> ()
// Pointer #2: table entry at index 2; this is the pointer the result is stored to.
%40 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%41 = llvm.extractvalue %40[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%42 = llvm.mlir.constant(2 : i64) : i64
%43 = llvm.getelementptr %41[%42] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%44 = llvm.load %43 : !llvm.ptr<ptr<i8>>
%45 = llvm.bitcast %44 : !llvm.ptr<i8> to !llvm.ptr<f32>
// Struct for %45; its final value %54 is unused below as well.
%46 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%47 = llvm.insertvalue %45, %46[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%48 = llvm.insertvalue %45, %47[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%49 = llvm.mlir.constant(0 : index) : i64
%50 = llvm.insertvalue %49, %48[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%51 = llvm.mlir.constant(4 : index) : i64
%52 = llvm.insertvalue %51, %50[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
%53 = llvm.mlir.constant(1 : index) : i64
%54 = llvm.insertvalue %53, %52[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
// Same 64-byte alignment assumption for %45.
%55 = llvm.mlir.constant(0 : index) : i64
%56 = llvm.mlir.constant(63 : index) : i64
%57 = llvm.ptrtoint %45 : !llvm.ptr<f32> to i64
%58 = llvm.and %57, %56 : i64
%59 = llvm.icmp "eq" %58, %55 : i64
"llvm.intr.assume"(%59) : (i1) -> ()
// The computation: load one vector<4xf32> through each of pointers #0 and #1,
// multiply them element-wise and store the product through pointer #2.
// The load/store ops carry alignment = 4 even though 64-byte alignment was assumed.
%60 = llvm.bitcast %5 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%61 = llvm.load %60 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%62 = llvm.bitcast %25 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%63 = llvm.load %62 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%64 = llvm.fmul %61, %63 : vector<4xf32>
%65 = llvm.bitcast %45 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
llvm.store %64, %65 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
// Return 0 (presumably the "ok" status code; confirm against the HAL ABI).
%66 = llvm.mlir.constant(0 : i32) : i32
llvm.return %66 : i32
}
}
// -----// IR Dump After Canonicalizer //----- //
// Executable module as printed after the Canonicalizer pass. In this form all
// constants sit at the top of the function and no struct-building
// (llvm.mlir.undef / llvm.insertvalue) ops appear in the body.
module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Dispatch entry point: (environment*, dispatch state*, workgroup state*) -> i32.
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
// %0: return value; %1, %2: GEP indices 2 and 1 into the pointer table;
// %3, %4: mask and expected value for the alignment checks.
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(2 : i64) : i64
%2 = llvm.mlir.constant(1 : i64) : i64
%3 = llvm.mlir.constant(63 : index) : i64
%4 = llvm.mlir.constant(0 : index) : i64
// Pointer #0: first entry of the ptr<ptr<i8>> table in dispatch-state field 10,
// cast to f32*, followed by an assume that its address is a multiple of 64.
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>>
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32>
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64
%10 = llvm.and %9, %3 : i64
%11 = llvm.icmp "eq" %10, %4 : i64
"llvm.intr.assume"(%11) : (i1) -> ()
// Pointer #1: table entry at index 1, with the same alignment assumption.
// The dispatch state struct is loaded from %arg1 again rather than reusing %5.
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>>
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32>
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64
%18 = llvm.and %17, %3 : i64
%19 = llvm.icmp "eq" %18, %4 : i64
"llvm.intr.assume"(%19) : (i1) -> ()
// Pointer #2: table entry at index 2 (the store destination below).
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32>
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64
%26 = llvm.and %25, %3 : i64
%27 = llvm.icmp "eq" %26, %4 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Load a vector<4xf32> through pointers #0 and #1, multiply element-wise, and
// store the product through pointer #2, then return 0.
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%32 = llvm.fmul %29, %31 : vector<4xf32>
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
llvm.return %0 : i32
}
}
// -----// IR Dump After CSE //----- //
// Executable module as printed after the CSE pass. The three loads of the
// dispatch state from %arg1 (%5, %12, %20) are all still present in this form.
module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Dispatch entry point: (environment*, dispatch state*, workgroup state*) -> i32.
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
// Shared constants: return value 0, table indices 2 and 1, alignment mask 63,
// and the zero the masked address is compared against.
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(2 : i64) : i64
%2 = llvm.mlir.constant(1 : i64) : i64
%3 = llvm.mlir.constant(63 : index) : i64
%4 = llvm.mlir.constant(0 : index) : i64
// First operand pointer: entry 0 of the pointer table held in field 10 of the
// dispatch state; assumed to have its low 6 address bits clear.
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>>
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32>
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64
%10 = llvm.and %9, %3 : i64
%11 = llvm.icmp "eq" %10, %4 : i64
"llvm.intr.assume"(%11) : (i1) -> ()
// Second operand pointer: entry 1 of the same table, same assumption.
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>>
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32>
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64
%18 = llvm.and %17, %3 : i64
%19 = llvm.icmp "eq" %18, %4 : i64
"llvm.intr.assume"(%19) : (i1) -> ()
// Result pointer: entry 2 of the same table, same assumption.
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32>
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64
%26 = llvm.and %25, %3 : i64
%27 = llvm.icmp "eq" %26, %4 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// result[0..3] = lhs[0..3] * rhs[0..3], done as a single vector<4xf32> fmul.
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%32 = llvm.fmul %29, %31 : vector<4xf32>
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
llvm.return %0 : i32
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::TranslateTargetExecutableVariantsPass //----- //
// Executable variant for the "embedded-elf-x86_64" target, as printed after
// TranslateTargetExecutableVariantsPass. It holds one export op and the inner
// module that contains the LLVM-dialect implementation.
hal.executable.variant public @embedded_elf_x86_64, target = <"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> {
// Export #0. Its layout declares no push constants and one descriptor set with
// three storage-buffer bindings (0, 1, 2). The region returns three index values,
// (%arg1 ceildiv 4, 1, 1), presumably the x/y/z workgroup counts (confirm).
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]>) {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]>} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 4)>()[%arg1]
hal.return %0, %c1, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Implementation of the export: (environment*, dispatch state*, workgroup state*) -> i32.
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
// Constants: return value, table indices 2 and 1, alignment mask and zero.
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(2 : i64) : i64
%2 = llvm.mlir.constant(1 : i64) : i64
%3 = llvm.mlir.constant(63 : index) : i64
%4 = llvm.mlir.constant(0 : index) : i64
// Pointer from table entry 0 (dispatch-state field 10), assumed 64-byte aligned.
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>>
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32>
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64
%10 = llvm.and %9, %3 : i64
%11 = llvm.icmp "eq" %10, %4 : i64
"llvm.intr.assume"(%11) : (i1) -> ()
// Pointer from table entry 1, assumed 64-byte aligned.
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>>
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32>
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64
%18 = llvm.and %17, %3 : i64
%19 = llvm.icmp "eq" %18, %4 : i64
"llvm.intr.assume"(%19) : (i1) -> ()
// Pointer from table entry 2, assumed 64-byte aligned.
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32>
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64
%26 = llvm.and %25, %3 : i64
%27 = llvm.icmp "eq" %26, %4 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Element-wise product of the two vector<4xf32> inputs, stored via entry 2.
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%32 = llvm.fmul %29, %31 : vector<4xf32>
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
llvm.return %0 : i32
}
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::TranslateExecutablesPass //----- //
// The whole executable as printed after TranslateExecutablesPass: a private
// hal.executable wrapping a single variant for the embedded-elf-x86_64 target.
hal.executable private @simple_mul_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = <"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> {
// Export #0: three storage-buffer bindings, no push constants. The region yields
// (%arg1 ceildiv 4, 1, 1), presumably the workgroup counts per dimension (confirm).
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]>) {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]>} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 4)>()[%arg1]
hal.return %0, %c1, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// LLVM-dialect body of the export; returns i32 0 after performing the multiply.
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(2 : i64) : i64
%2 = llvm.mlir.constant(1 : i64) : i64
%3 = llvm.mlir.constant(63 : index) : i64
%4 = llvm.mlir.constant(0 : index) : i64
// Each of the next three groups reads one pointer out of the ptr<ptr<i8>> table
// in field 10 of the dispatch state (entries 0, 1, 2), casts it to f32*, and
// tells LLVM to assume (address & 63) == 0.
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>>
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32>
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64
%10 = llvm.and %9, %3 : i64
%11 = llvm.icmp "eq" %10, %4 : i64
"llvm.intr.assume"(%11) : (i1) -> ()
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>>
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32>
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64
%18 = llvm.and %17, %3 : i64
%19 = llvm.icmp "eq" %18, %4 : i64
"llvm.intr.assume"(%19) : (i1) -> ()
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32>
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64
%26 = llvm.and %25, %3 : i64
%27 = llvm.icmp "eq" %26, %4 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Vector multiply: loads through entries 0 and 1, store through entry 2.
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%32 = llvm.fmul %29, %31 : vector<4xf32>
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
llvm.return %0 : i32
}
}
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::(anonymous namespace)::ConvertToHALPass //----- //
// Attribute aliases referenced by the module that follows.
// #device_target_cpu: the "cpu" device target with one llvm / embedded-elf-x86_64
// executable target.
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}>
// #executable_layout: no push constants; descriptor set 0 with storage-buffer
// bindings 0, 1 and 2.
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]>
// #executable_target_embedded_elf_x86_64_: the same executable target that is
// listed inside #device_target_cpu.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map: symbol s0 -> s0 ceildiv 4.
#map = affine_map<()[s0] -> (s0 ceildiv 4)>
// #translation: codegen translation info (CPUDoubleTilingExpert, workload_per_wg = [4]).
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]>
// Whole program as printed after ConvertToHALPass: the compiled executable plus
// the host-side function @simple_mul, which now uses HAL ops to stage buffers,
// record a command buffer and dispatch the executable.
module attributes {hal.device.targets = [#device_target_cpu]} {
hal.executable private @simple_mul_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export #0; the region yields (%arg1 ceildiv 4, 1, 1) via #map, presumably the
// workgroup counts per dimension (confirm).
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map()[%arg1]
hal.return %0, %c1, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Device-side kernel: reads three pointers from the table in field 10 of the
// dispatch state, multiplies two vector<4xf32> values and stores the product.
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(2 : i64) : i64
%2 = llvm.mlir.constant(1 : i64) : i64
%3 = llvm.mlir.constant(63 : index) : i64
%4 = llvm.mlir.constant(0 : index) : i64
// Table entry 0 -> f32*, with an assume that the address is a multiple of 64.
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>>
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32>
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64
%10 = llvm.and %9, %3 : i64
%11 = llvm.icmp "eq" %10, %4 : i64
"llvm.intr.assume"(%11) : (i1) -> ()
// Table entry 1 -> f32*, same assumption.
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>>
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32>
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64
%18 = llvm.and %17, %3 : i64
%19 = llvm.icmp "eq" %18, %4 : i64
"llvm.intr.assume"(%19) : (i1) -> ()
// Table entry 2 -> f32*, same assumption.
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32>
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64
%26 = llvm.and %25, %3 : i64
%27 = llvm.icmp "eq" %26, %4 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Multiply the two loaded vector<4xf32> values and store through entry 2.
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%32 = llvm.fmul %29, %31 : vector<4xf32>
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
llvm.return %0 : i32
}
}
}
}
// Host-side entry point: takes two buffer views and returns a new buffer view
// that wraps the buffer the dispatch writes to.
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
// NOTE(review): 553648160 == 0x21000020, presumably the HAL element-type code
// for f32; the encoding value 1 is likewise an opaque HAL code. Confirm both.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
// Validate input 0: the view must have shape [4] with the type/encoding above,
// and its backing buffer must be at least 16 long with the listed usage bits.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Validate input 1 in exactly the same way.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
%device_1 = hal.ex.shared_device : !hal.device
%allocator_2 = hal.device.allocator<%device_1 : !hal.device> : !hal.allocator
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator_2 : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Allocate the result buffer (%buffer_5) with size %c16.
%device_3 = hal.ex.shared_device : !hal.device
%allocator_4 = hal.device.allocator<%device_3 : !hal.device> : !hal.allocator
%buffer_5 = hal.allocator.allocate<%allocator_4 : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16}
// Create and begin recording a one-shot command buffer.
%device_6 = hal.ex.shared_device : !hal.device
%cmd = hal.command_buffer.create device(%device_6 : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
%0 = hal.command_buffer.device<%cmd : !hal.command_buffer> : !hal.device
// Device switch with a single case matching the "embedded-elf-x86_64" executable
// format. The case binds the two inputs and the result buffer to bindings 0, 1
// and 2 (offset %c0, length %c16 each) and dispatches the export with
// workgroups (#map applied to %c4, 1, 1).
hal.device.switch<%0 : !hal.device>
#hal.device.match.executable.format<"embedded-elf-x86_64"> {
%executable_layout = hal.executable_layout.lookup device(%0 : !hal.device) layout(#executable_layout) : !hal.executable_layout
%c0_11 = arith.constant 0 : index
%c1_12 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c0_13 = arith.constant 0 : index
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0_13] bindings([
%c0_11 = (%buffer : !hal.buffer)[%c0, %c16],
%c1_12 = (%buffer_0 : !hal.buffer)[%c0, %c16],
%c2 = (%buffer_5 : !hal.buffer)[%c0, %c16]
])
%c1_14 = arith.constant 1 : index
%1 = affine.apply #map()[%c4]
hal.command_buffer.dispatch.symbol<%cmd : !hal.command_buffer> target(@simple_mul_dispatch_0::@embedded_elf_x86_64::@simple_mul_dispatch_0) workgroups([%1, %c1_14, %c1_14])
hal.return
}
// Record an execution barrier, finish recording, then submit and wait.
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device_6, %cmd
// Wrap the result buffer in a shape-[4] buffer view, using the same type and
// encoding constants as the input checks, and return it.
%c0_7 = arith.constant 0 : index
%c4_8 = arith.constant 4 : index
%c553648160_i32_9 = arith.constant 553648160 : i32
%c1_i32_10 = arith.constant 1 : i32
%view = hal.buffer_view.create buffer(%buffer_5 : !hal.buffer) shape([%c4_8]) type(%c553648160_i32_9) encoding(%c1_i32_10) : !hal.buffer_view
return %view : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer //----- //
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<()[s0] -> (s0 ceildiv 4)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]>
module attributes {hal.device.targets = [#device_target_cpu]} {
// Device executable holding the single dispatch entry point of simple_mul.
// It has one variant, built for the "llvm" / "embedded-elf-x86_64" target
// named by #executable_target_embedded_elf_x86_64_.
hal.executable private @simple_mul_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export entry (ordinal 0). Its region returns the workgroup count
// (%arg1 ceildiv 4, 1, 1) via #map; %arg0, %arg2 and %arg3 are not used.
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map()[%arg1]
hal.return %0, %c1, %c1 : index, index, index
}
// Nested module holding the dispatch body, already lowered to the LLVM dialect.
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Dispatch function: reads three pointers out of the dispatch state (%arg1),
// multiplies two vector<4xf32> values and stores the product. %arg0 and %arg2
// are not referenced in the body. It always returns the i32 constant 0.
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
// Constants: return value 0, pointer-table indices 2 and 1, and the mask 63 /
// zero pair used by the alignment assumptions below.
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(2 : i64) : i64
%2 = llvm.mlir.constant(1 : i64) : i64
%3 = llvm.mlir.constant(63 : index) : i64
%4 = llvm.mlir.constant(0 : index) : i64
// Pointer #0: load the dispatch state, take struct field 10 (ptr<ptr<i8>>),
// load its first element and view it as an f32 pointer.
// NOTE(review): field 10 is presumably the binding pointer table - confirm
// against the iree_hal_executable_dispatch_state_v0_t definition.
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>>
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32>
// Assume (pointer #0 & 63) == 0, i.e. the address is a multiple of 64.
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64
%10 = llvm.and %9, %3 : i64
%11 = llvm.icmp "eq" %10, %4 : i64
"llvm.intr.assume"(%11) : (i1) -> ()
// Pointer #1: same sequence, indexing the field-10 table with %2 (= 1).
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>>
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32>
// Same multiple-of-64 assumption for pointer #1.
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64
%18 = llvm.and %17, %3 : i64
%19 = llvm.icmp "eq" %18, %4 : i64
"llvm.intr.assume"(%19) : (i1) -> ()
// Pointer #2: same sequence, indexing the field-10 table with %1 (= 2).
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32>
// Same multiple-of-64 assumption for pointer #2.
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64
%26 = llvm.and %25, %3 : i64
%27 = llvm.icmp "eq" %26, %4 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Compute: load 4 x f32 through pointers #0 and #1, multiply elementwise,
// and store the product through pointer #2.
// NOTE(review): these accesses are tagged alignment = 4 even though the
// assumes above state 64-byte alignment; left exactly as emitted.
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%32 = llvm.fmul %29, %31 : vector<4xf32>
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
llvm.return %0 : i32
}
}
}
}
// Host-side entry point for simple_mul: checks both input buffer views,
// allocates a result buffer, records and submits one dispatch of
// @simple_mul_dispatch_0, and returns a buffer view over the result.
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Constants used below: binding indices, sizes, and the element-type /
// encoding codes passed to the buffer-view checks.
%c2 = arith.constant 2 : index
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
// Check %arg0: shape [4], element type 553648160, encoding 1.
// NOTE(review): 553648160 is 0x21000020, presumably the HAL code for f32 - confirm.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
// Fetch its backing buffer and assert a minimum length of 16 (presumably
// bytes: 4 x f32 - confirm) plus the listed memory type and usage bits.
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Same checks for %arg1. The shared device and allocator are queried again
// (%device_1, %allocator_2) instead of reusing %device / %allocator.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
%device_1 = hal.ex.shared_device : !hal.device
%allocator_2 = hal.device.allocator<%device_1 : !hal.device> : !hal.allocator
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator_2 : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Allocate the result buffer of size %c16 from a third device/allocator query.
%device_3 = hal.ex.shared_device : !hal.device
%allocator_4 = hal.device.allocator<%device_3 : !hal.device> : !hal.allocator
%buffer_5 = hal.allocator.allocate<%allocator_4 : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16}
// Create and begin a one-shot command buffer on a fourth shared-device query.
%device_6 = hal.ex.shared_device : !hal.device
%cmd = hal.command_buffer.create device(%device_6 : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
// The dispatch is recorded only in the region matching executable format
// "embedded-elf-x86_64".
hal.device.switch<%device_6 : !hal.device>
#hal.device.match.executable.format<"embedded-elf-x86_64"> {
// Bind three buffers on set %c0: binding 0 = %arg0's buffer, binding 1 =
// %arg1's buffer, binding 2 = the result buffer, each with range [%c0, %c16].
%executable_layout = hal.executable_layout.lookup device(%device_6 : !hal.device) layout(#executable_layout) : !hal.executable_layout
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c16],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16],
%c2 = (%buffer_5 : !hal.buffer)[%c0, %c16]
])
// Dispatch with a workgroup count of 1 x 1 x 1.
hal.command_buffer.dispatch.symbol<%cmd : !hal.command_buffer> target(@simple_mul_dispatch_0::@embedded_elf_x86_64::@simple_mul_dispatch_0) workgroups([%c1, %c1, %c1])
hal.return
}
// Record an execution barrier, end recording, then submit and wait.
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device_6, %cmd
// Wrap the result buffer in a view with shape [4] and the same element type
// and encoding codes as the inputs, and return it.
%view = hal.buffer_view.create buffer(%buffer_5 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
}
}
// -----// IR Dump After CSE //----- //
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<()[s0] -> (s0 ceildiv 4)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]>
module attributes {hal.device.targets = [#device_target_cpu]} {
// Device executable holding the single dispatch entry point of simple_mul.
// It has one variant, built for the "llvm" / "embedded-elf-x86_64" target
// named by #executable_target_embedded_elf_x86_64_.
hal.executable private @simple_mul_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export entry (ordinal 0). Its region returns the workgroup count
// (%arg1 ceildiv 4, 1, 1) via #map; %arg0, %arg2 and %arg3 are not used.
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map()[%arg1]
hal.return %0, %c1, %c1 : index, index, index
}
// Nested module holding the dispatch body, already lowered to the LLVM dialect.
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Dispatch function: reads three pointers out of the dispatch state (%arg1),
// multiplies two vector<4xf32> values and stores the product. %arg0 and %arg2
// are not referenced in the body. It always returns the i32 constant 0.
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
// Constants: return value 0, pointer-table indices 2 and 1, and the mask 63 /
// zero pair used by the alignment assumptions below.
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(2 : i64) : i64
%2 = llvm.mlir.constant(1 : i64) : i64
%3 = llvm.mlir.constant(63 : index) : i64
%4 = llvm.mlir.constant(0 : index) : i64
// Pointer #0: load the dispatch state, take struct field 10 (ptr<ptr<i8>>),
// load its first element and view it as an f32 pointer.
// NOTE(review): field 10 is presumably the binding pointer table - confirm
// against the iree_hal_executable_dispatch_state_v0_t definition.
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>>
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32>
// Assume (pointer #0 & 63) == 0, i.e. the address is a multiple of 64.
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64
%10 = llvm.and %9, %3 : i64
%11 = llvm.icmp "eq" %10, %4 : i64
"llvm.intr.assume"(%11) : (i1) -> ()
// Pointer #1: same sequence, indexing the field-10 table with %2 (= 1).
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>>
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32>
// Same multiple-of-64 assumption for pointer #1.
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64
%18 = llvm.and %17, %3 : i64
%19 = llvm.icmp "eq" %18, %4 : i64
"llvm.intr.assume"(%19) : (i1) -> ()
// Pointer #2: same sequence, indexing the field-10 table with %1 (= 2).
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32>
// Same multiple-of-64 assumption for pointer #2.
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64
%26 = llvm.and %25, %3 : i64
%27 = llvm.icmp "eq" %26, %4 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Compute: load 4 x f32 through pointers #0 and #1, multiply elementwise,
// and store the product through pointer #2.
// NOTE(review): these accesses are tagged alignment = 4 even though the
// assumes above state 64-byte alignment; left exactly as emitted.
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%32 = llvm.fmul %29, %31 : vector<4xf32>
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
llvm.return %0 : i32
}
}
}
}
// Host-side entry point for simple_mul: checks both input buffer views,
// allocates a result buffer, records and submits one dispatch of
// @simple_mul_dispatch_0, and returns a buffer view over the result.
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Constants used below: binding indices, sizes, and the element-type /
// encoding codes passed to the buffer-view checks.
%c2 = arith.constant 2 : index
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
// Check %arg0: shape [4], element type 553648160, encoding 1.
// NOTE(review): 553648160 is 0x21000020, presumably the HAL code for f32 - confirm.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
// Fetch its backing buffer and assert a minimum length of 16 (presumably
// bytes: 4 x f32 - confirm) plus the listed memory type and usage bits.
// %device and %allocator are reused by every later step in this function.
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Same checks for %arg1.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Allocate the result buffer of size %c16.
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16}
// Create and begin a one-shot command buffer.
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
// The dispatch is recorded only in the region matching executable format
// "embedded-elf-x86_64".
hal.device.switch<%device : !hal.device>
#hal.device.match.executable.format<"embedded-elf-x86_64"> {
// Bind three buffers on set %c0: binding 0 = %arg0's buffer, binding 1 =
// %arg1's buffer, binding 2 = the result buffer, each with range [%c0, %c16].
%executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layout(#executable_layout) : !hal.executable_layout
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c16],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16]
])
// Dispatch with a workgroup count of 1 x 1 x 1.
hal.command_buffer.dispatch.symbol<%cmd : !hal.command_buffer> target(@simple_mul_dispatch_0::@embedded_elf_x86_64::@simple_mul_dispatch_0) workgroups([%c1, %c1, %c1])
hal.return
}
// Record an execution barrier, end recording, then submit and wait.
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
// Wrap the result buffer in a view with shape [4] and the same element type
// and encoding codes as the inputs, and return it.
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- //
// Host-side entry point for simple_mul: checks both input buffer views,
// allocates a result buffer, records and submits one dispatch of
// @simple_mul_dispatch_0, and returns a buffer view over the result.
// This dump prints the function alone, so the executable layout attribute
// appears inline in the lookup op below instead of through an alias.
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Constants used below: binding indices, sizes, and the element-type /
// encoding codes passed to the buffer-view checks.
%c2 = arith.constant 2 : index
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
// Check %arg0: shape [4], element type 553648160, encoding 1.
// NOTE(review): 553648160 is 0x21000020, presumably the HAL code for f32 - confirm.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
// Fetch its backing buffer and assert a minimum length of 16 (presumably
// bytes: 4 x f32 - confirm) plus the listed memory type and usage bits.
// %device and %allocator are reused by every later step in this function.
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Same checks for %arg1.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Allocate the result buffer of size %c16.
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16}
// Create and begin a one-shot command buffer.
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
// The dispatch is recorded only in the region matching executable format
// "embedded-elf-x86_64".
hal.device.switch<%device : !hal.device>
#hal.device.match.executable.format<"embedded-elf-x86_64"> {
// Look up the layout (0 push constants, one set with three storage-buffer
// bindings) and bind on set %c0: binding 0 = %arg0's buffer, binding 1 =
// %arg1's buffer, binding 2 = the result buffer, each with range [%c0, %c16].
%executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layout(<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]>) : !hal.executable_layout
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c16],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16]
])
// Dispatch with a workgroup count of 1 x 1 x 1.
hal.command_buffer.dispatch.symbol<%cmd : !hal.command_buffer> target(@simple_mul_dispatch_0::@embedded_elf_x86_64::@simple_mul_dispatch_0) workgroups([%c1, %c1, %c1])
hal.return
}
// Record an execution barrier, end recording, then submit and wait.
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
// Wrap the result buffer in a view with shape [4] and the same element type
// and encoding codes as the inputs, and return it.
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass //----- //
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<()[s0] -> (s0 ceildiv 4)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]>
module attributes {hal.device.targets = [#device_target_cpu]} {
// Device executable holding the single dispatch entry point of simple_mul.
// It has one variant, built for the "llvm" / "embedded-elf-x86_64" target
// named by #executable_target_embedded_elf_x86_64_.
hal.executable private @simple_mul_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export entry (ordinal 0). Its region returns the workgroup count
// (%arg1 ceildiv 4, 1, 1) via #map; %arg0, %arg2 and %arg3 are not used.
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map()[%arg1]
hal.return %0, %c1, %c1 : index, index, index
}
// Nested module holding the dispatch body, already lowered to the LLVM dialect.
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Dispatch function: reads three pointers out of the dispatch state (%arg1),
// multiplies two vector<4xf32> values and stores the product. %arg0 and %arg2
// are not referenced in the body. It always returns the i32 constant 0.
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
// Constants: return value 0, pointer-table indices 2 and 1, and the mask 63 /
// zero pair used by the alignment assumptions below.
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(2 : i64) : i64
%2 = llvm.mlir.constant(1 : i64) : i64
%3 = llvm.mlir.constant(63 : index) : i64
%4 = llvm.mlir.constant(0 : index) : i64
// Pointer #0: load the dispatch state, take struct field 10 (ptr<ptr<i8>>),
// load its first element and view it as an f32 pointer.
// NOTE(review): field 10 is presumably the binding pointer table - confirm
// against the iree_hal_executable_dispatch_state_v0_t definition.
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>>
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32>
// Assume (pointer #0 & 63) == 0, i.e. the address is a multiple of 64.
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64
%10 = llvm.and %9, %3 : i64
%11 = llvm.icmp "eq" %10, %4 : i64
"llvm.intr.assume"(%11) : (i1) -> ()
// Pointer #1: same sequence, indexing the field-10 table with %2 (= 1).
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>>
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32>
// Same multiple-of-64 assumption for pointer #1.
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64
%18 = llvm.and %17, %3 : i64
%19 = llvm.icmp "eq" %18, %4 : i64
"llvm.intr.assume"(%19) : (i1) -> ()
// Pointer #2: same sequence, indexing the field-10 table with %1 (= 2).
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32>
// Same multiple-of-64 assumption for pointer #2.
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64
%26 = llvm.and %25, %3 : i64
%27 = llvm.icmp "eq" %26, %4 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Compute: load 4 x f32 through pointers #0 and #1, multiply elementwise,
// and store the product through pointer #2.
// NOTE(review): these accesses are tagged alignment = 4 even though the
// assumes above state 64-byte alignment; left exactly as emitted.
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%32 = llvm.fmul %29, %31 : vector<4xf32>
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
llvm.return %0 : i32
}
}
}
}
// Host-side entry point for simple_mul: checks both input buffer views,
// allocates a result buffer, records and submits one dispatch of
// @simple_mul_dispatch_0, and returns a buffer view over the result.
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Constants used below: binding indices, sizes, and the element-type /
// encoding codes passed to the buffer-view checks.
%c2 = arith.constant 2 : index
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
// Check %arg0: shape [4], element type 553648160, encoding 1.
// NOTE(review): 553648160 is 0x21000020, presumably the HAL code for f32 - confirm.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
// Fetch its backing buffer and assert a minimum length of 16 (presumably
// bytes: 4 x f32 - confirm) plus the listed memory type and usage bits.
// %device and %allocator are reused by every later step in this function.
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Same checks for %arg1.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Allocate the result buffer of size %c16.
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16}
// Create and begin a one-shot command buffer.
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
// The dispatch is recorded only in the region matching executable format
// "embedded-elf-x86_64".
hal.device.switch<%device : !hal.device>
#hal.device.match.executable.format<"embedded-elf-x86_64"> {
// Bind three buffers on set %c0: binding 0 = %arg0's buffer, binding 1 =
// %arg1's buffer, binding 2 = the result buffer, each with range [%c0, %c16].
%executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layout(#executable_layout) : !hal.executable_layout
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c16],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16]
])
// Dispatch with a workgroup count of 1 x 1 x 1.
hal.command_buffer.dispatch.symbol<%cmd : !hal.command_buffer> target(@simple_mul_dispatch_0::@embedded_elf_x86_64::@simple_mul_dispatch_0) workgroups([%c1, %c1, %c1])
hal.return
}
// Record an execution barrier, end recording, then submit and wait.
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
// Wrap the result buffer in a view with shape [4] and the same element type
// and encoding codes as the inputs, and return it.
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass //----- //
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<()[s0] -> (s0 ceildiv 4)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]>
module attributes {hal.device.targets = [#device_target_cpu]} {
// Device executable holding the single dispatch entry point of simple_mul.
// It has one variant, built for the "llvm" / "embedded-elf-x86_64" target
// named by #executable_target_embedded_elf_x86_64_.
hal.executable private @simple_mul_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export entry (ordinal 0). Its region returns the workgroup count
// (%arg1 ceildiv 4, 1, 1) via #map; %arg0, %arg2 and %arg3 are not used.
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map()[%arg1]
hal.return %0, %c1, %c1 : index, index, index
}
// Nested module holding the dispatch body, already lowered to the LLVM dialect.
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Dispatch function: reads three pointers out of the dispatch state (%arg1),
// multiplies two vector<4xf32> values and stores the product. %arg0 and %arg2
// are not referenced in the body. It always returns the i32 constant 0.
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
// Constants: return value 0, pointer-table indices 2 and 1, and the mask 63 /
// zero pair used by the alignment assumptions below.
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(2 : i64) : i64
%2 = llvm.mlir.constant(1 : i64) : i64
%3 = llvm.mlir.constant(63 : index) : i64
%4 = llvm.mlir.constant(0 : index) : i64
// Pointer #0: load the dispatch state, take struct field 10 (ptr<ptr<i8>>),
// load its first element and view it as an f32 pointer.
// NOTE(review): field 10 is presumably the binding pointer table - confirm
// against the iree_hal_executable_dispatch_state_v0_t definition.
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>>
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32>
// Assume (pointer #0 & 63) == 0, i.e. the address is a multiple of 64.
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64
%10 = llvm.and %9, %3 : i64
%11 = llvm.icmp "eq" %10, %4 : i64
"llvm.intr.assume"(%11) : (i1) -> ()
// Pointer #1: same sequence, indexing the field-10 table with %2 (= 1).
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>>
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32>
// Same multiple-of-64 assumption for pointer #1.
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64
%18 = llvm.and %17, %3 : i64
%19 = llvm.icmp "eq" %18, %4 : i64
"llvm.intr.assume"(%19) : (i1) -> ()
// Pointer #2: same sequence, indexing the field-10 table with %1 (= 2).
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32>
// Same multiple-of-64 assumption for pointer #2.
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64
%26 = llvm.and %25, %3 : i64
%27 = llvm.icmp "eq" %26, %4 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Compute: load 4 x f32 through pointers #0 and #1, multiply elementwise,
// and store the product through pointer #2.
// NOTE(review): these accesses are tagged alignment = 4 even though the
// assumes above state 64-byte alignment; left exactly as emitted.
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%32 = llvm.fmul %29, %31 : vector<4xf32>
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
llvm.return %0 : i32
}
}
}
}
// Host-side entry point for simple_mul: checks both input buffer views,
// allocates a result buffer, records and submits one dispatch of
// @simple_mul_dispatch_0, and returns a buffer view over the result.
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Constants used below: binding indices, sizes, and the element-type /
// encoding codes passed to the buffer-view checks.
%c2 = arith.constant 2 : index
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
// Check %arg0: shape [4], element type 553648160, encoding 1.
// NOTE(review): 553648160 is 0x21000020, presumably the HAL code for f32 - confirm.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
// Fetch its backing buffer and assert a minimum length of 16 (presumably
// bytes: 4 x f32 - confirm) plus the listed memory type and usage bits.
// %device and %allocator are reused by every later step in this function.
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Same checks for %arg1.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Allocate the result buffer of size %c16.
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16}
// Create and begin a one-shot command buffer.
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
// The dispatch is recorded only in the region matching executable format
// "embedded-elf-x86_64".
hal.device.switch<%device : !hal.device>
#hal.device.match.executable.format<"embedded-elf-x86_64"> {
// Bind three buffers on set %c0: binding 0 = %arg0's buffer, binding 1 =
// %arg1's buffer, binding 2 = the result buffer, each with range [%c0, %c16].
%executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layout(#executable_layout) : !hal.executable_layout
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c16],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16]
])
// Dispatch with a workgroup count of 1 x 1 x 1.
hal.command_buffer.dispatch.symbol<%cmd : !hal.command_buffer> target(@simple_mul_dispatch_0::@embedded_elf_x86_64::@simple_mul_dispatch_0) workgroups([%c1, %c1, %c1])
hal.return
}
// Record an execution barrier, end recording, then submit and wait.
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
// Wrap the result buffer in a view with shape [4] and the same element type
// and encoding codes as the inputs, and return it.
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FuseGlobalsPass //----- //
// Device target: a "cpu" device with one executable target ("llvm" backend,
// "embedded-elf-x86_64" format).
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}>
// Executable layout: no push constants; one descriptor set (index 0) with
// three storage_buffer bindings numbered 0, 1 and 2.
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]>
// Stand-alone alias for the same executable target attribute that is nested
// inside #device_target_cpu above.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// Maps the single symbol s0 to ceil(s0 / 4).
#map = affine_map<()[s0] -> (s0 ceildiv 4)>
// Codegen translation info: CPUDoubleTilingExpert with workload_per_wg = [4].
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]>
// Whole-program module for simple_mul: one HAL executable holding the device
// kernel (already lowered to the LLVM dialect) and the host entry point
// @simple_mul that dispatches it.
module attributes {hal.device.targets = [#device_target_cpu]} {
hal.executable private @simple_mul_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export ordinal 0. The region returns three index values:
// (ceildiv(%arg1, 4), 1, 1) -- presumably the workgroup count; %arg0, %arg2
// and %arg3 are unused.
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map()[%arg1]
hal.return %0, %c1, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Kernel body. Takes pointers to the environment, dispatch-state and
// workgroup-state structs; only %arg1 (dispatch state) is read. Loads three
// pointers from the table in dispatch-state field 10, multiplies the
// vector<4xf32> values behind the first two, stores the product through the
// third, and returns 0.
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
// %0: return value. %1/%2: table indices 2 and 1. %3/%4: mask 63 and the
// expected result 0 for the alignment assumptions below.
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(2 : i64) : i64
%2 = llvm.mlir.constant(1 : i64) : i64
%3 = llvm.mlir.constant(63 : index) : i64
%4 = llvm.mlir.constant(0 : index) : i64
// Entry 0 of the field-10 pointer table (presumably binding 0 -- matches the
// three bindings pushed by the host function): load the struct by value,
// take field 10, load the pointer, view it as f32*, and assume
// (address & 63) == 0, i.e. 64-byte alignment.
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>>
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32>
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64
%10 = llvm.and %9, %3 : i64
%11 = llvm.icmp "eq" %10, %4 : i64
"llvm.intr.assume"(%11) : (i1) -> ()
// Entry 1: same sequence, indexing the table with %2 (= 1). The dispatch
// state struct is re-loaded rather than reusing %5.
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>>
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32>
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64
%18 = llvm.and %17, %3 : i64
%19 = llvm.icmp "eq" %18, %4 : i64
"llvm.intr.assume"(%19) : (i1) -> ()
// Entry 2: same sequence, indexing the table with %1 (= 2).
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32>
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64
%26 = llvm.and %25, %3 : i64
%27 = llvm.icmp "eq" %26, %4 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Compute: load 4 x f32 from entries 0 and 1, multiply element-wise, store to
// entry 2.
// NOTE(review): these accesses carry alignment = 4 although the assumes above
// assert 64-byte alignment of the same pointers.
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%32 = llvm.fmul %29, %31 : vector<4xf32>
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
llvm.return %0 : i32
}
}
}
}
// Host-side entry point: validates both arguments, allocates a 16-byte result
// buffer, records one dispatch of the executable above, submits it, and
// returns the result as a new buffer view.
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// %c0/%c1/%c2 are the binding keys below; %c16 is the minimum length and the
// allocation size; %c4 is the expected shape.
%c2 = arith.constant 2 : index
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
// 553648160 == 0x21000020; presumably the HAL element-type code for f32 --
// TODO confirm. Meaning of encoding value 1 is not visible here.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
// Check argument 0 (shape [4], type, encoding, >= 16-byte buffer).
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Check argument 1 the same way.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Allocate the %c16-byte result buffer.
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
// Single device case ("embedded-elf-x86_64"): bind inputs and result as
// bindings 0/1/2 of set 0, then dispatch with a 1x1x1 workgroup count.
hal.device.switch<%device : !hal.device>
#hal.device.match.executable.format<"embedded-elf-x86_64"> {
%executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layout(#executable_layout) : !hal.executable_layout
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c16],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16]
])
// The dispatch target is named by its nested symbol path at this stage.
hal.command_buffer.dispatch.symbol<%cmd : !hal.command_buffer> target(@simple_mul_dispatch_0::@embedded_elf_x86_64::@simple_mul_dispatch_0) workgroups([%c1, %c1, %c1])
hal.return
}
// Barrier after the dispatch, then end, submit and wait.
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::LinkTargetExecutablesPass //----- //
// Device target: a "cpu" device with one executable target ("llvm" backend,
// "embedded-elf-x86_64" format).
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}>
// Executable layout: no push constants; one descriptor set (index 0) with
// three storage_buffer bindings numbered 0, 1 and 2.
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]>
// Stand-alone alias for the same executable target attribute that is nested
// inside #device_target_cpu above.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// Maps the single symbol s0 to ceil(s0 / 4).
#map = affine_map<()[s0] -> (s0 ceildiv 4)>
// Codegen translation info: CPUDoubleTilingExpert with workload_per_wg = [4].
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]>
// Whole-program module for simple_mul: one HAL executable holding the device
// kernel (LLVM dialect) and the host entry point @simple_mul.
// NOTE(review): this module appears textually unchanged from the one printed
// for the preceding pass in this log -- the pass looks like a no-op here.
module attributes {hal.device.targets = [#device_target_cpu]} {
hal.executable private @simple_mul_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export ordinal 0. The region returns three index values:
// (ceildiv(%arg1, 4), 1, 1) -- presumably the workgroup count; %arg0, %arg2
// and %arg3 are unused.
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map()[%arg1]
hal.return %0, %c1, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Kernel body. Only %arg1 (dispatch state) is read. Loads three pointers
// from the table in dispatch-state field 10, multiplies the vector<4xf32>
// values behind the first two, stores the product through the third, and
// returns 0.
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
// %0: return value. %1/%2: table indices 2 and 1. %3/%4: mask 63 and the
// expected result 0 for the alignment assumptions below.
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(2 : i64) : i64
%2 = llvm.mlir.constant(1 : i64) : i64
%3 = llvm.mlir.constant(63 : index) : i64
%4 = llvm.mlir.constant(0 : index) : i64
// Entry 0 of the field-10 pointer table (presumably binding 0): load the
// struct by value, take field 10, load the pointer, view it as f32*, and
// assume (address & 63) == 0, i.e. 64-byte alignment.
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>>
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32>
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64
%10 = llvm.and %9, %3 : i64
%11 = llvm.icmp "eq" %10, %4 : i64
"llvm.intr.assume"(%11) : (i1) -> ()
// Entry 1: same sequence, indexing the table with %2 (= 1). The dispatch
// state struct is re-loaded rather than reusing %5.
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>>
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32>
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64
%18 = llvm.and %17, %3 : i64
%19 = llvm.icmp "eq" %18, %4 : i64
"llvm.intr.assume"(%19) : (i1) -> ()
// Entry 2: same sequence, indexing the table with %1 (= 2).
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32>
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64
%26 = llvm.and %25, %3 : i64
%27 = llvm.icmp "eq" %26, %4 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Compute: load 4 x f32 from entries 0 and 1, multiply element-wise, store to
// entry 2.
// NOTE(review): these accesses carry alignment = 4 although the assumes above
// assert 64-byte alignment of the same pointers.
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%32 = llvm.fmul %29, %31 : vector<4xf32>
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
llvm.return %0 : i32
}
}
}
}
// Host-side entry point: validates both arguments, allocates a 16-byte result
// buffer, records one dispatch of the executable above, submits it, and
// returns the result as a new buffer view.
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// %c0/%c1/%c2 are the binding keys below; %c16 is the minimum length and the
// allocation size; %c4 is the expected shape.
%c2 = arith.constant 2 : index
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
// 553648160 == 0x21000020; presumably the HAL element-type code for f32 --
// TODO confirm. Meaning of encoding value 1 is not visible here.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
// Check argument 0 (shape [4], type, encoding, >= 16-byte buffer).
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Check argument 1 the same way.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Allocate the %c16-byte result buffer.
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
// Single device case ("embedded-elf-x86_64"): bind inputs and result as
// bindings 0/1/2 of set 0, then dispatch with a 1x1x1 workgroup count.
hal.device.switch<%device : !hal.device>
#hal.device.match.executable.format<"embedded-elf-x86_64"> {
%executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layout(#executable_layout) : !hal.executable_layout
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c16],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16]
])
// The dispatch target is named by its nested symbol path at this stage.
hal.command_buffer.dispatch.symbol<%cmd : !hal.command_buffer> target(@simple_mul_dispatch_0::@embedded_elf_x86_64::@simple_mul_dispatch_0) workgroups([%c1, %c1, %c1])
hal.return
}
// Barrier after the dispatch, then end, submit and wait.
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::LinkExecutablesPass //----- //
// Device target: a "cpu" device with one executable target ("llvm" backend,
// "embedded-elf-x86_64" format).
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}>
// Executable layout: no push constants; one descriptor set (index 0) with
// three storage_buffer bindings numbered 0, 1 and 2.
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]>
// Stand-alone alias for the same executable target attribute that is nested
// inside #device_target_cpu above.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// Maps the single symbol s0 to ceil(s0 / 4).
#map = affine_map<()[s0] -> (s0 ceildiv 4)>
// Codegen translation info: CPUDoubleTilingExpert with workload_per_wg = [4].
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]>
// Whole-program module for simple_mul: one HAL executable holding the device
// kernel (LLVM dialect) and the host entry point @simple_mul.
// NOTE(review): this module appears textually unchanged from the one printed
// for the preceding pass in this log -- the pass looks like a no-op here.
module attributes {hal.device.targets = [#device_target_cpu]} {
hal.executable private @simple_mul_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export ordinal 0. The region returns three index values:
// (ceildiv(%arg1, 4), 1, 1) -- presumably the workgroup count; %arg0, %arg2
// and %arg3 are unused.
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map()[%arg1]
hal.return %0, %c1, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Kernel body. Only %arg1 (dispatch state) is read. Loads three pointers
// from the table in dispatch-state field 10, multiplies the vector<4xf32>
// values behind the first two, stores the product through the third, and
// returns 0.
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
// %0: return value. %1/%2: table indices 2 and 1. %3/%4: mask 63 and the
// expected result 0 for the alignment assumptions below.
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(2 : i64) : i64
%2 = llvm.mlir.constant(1 : i64) : i64
%3 = llvm.mlir.constant(63 : index) : i64
%4 = llvm.mlir.constant(0 : index) : i64
// Entry 0 of the field-10 pointer table (presumably binding 0): load the
// struct by value, take field 10, load the pointer, view it as f32*, and
// assume (address & 63) == 0, i.e. 64-byte alignment.
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>>
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32>
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64
%10 = llvm.and %9, %3 : i64
%11 = llvm.icmp "eq" %10, %4 : i64
"llvm.intr.assume"(%11) : (i1) -> ()
// Entry 1: same sequence, indexing the table with %2 (= 1). The dispatch
// state struct is re-loaded rather than reusing %5.
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>>
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32>
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64
%18 = llvm.and %17, %3 : i64
%19 = llvm.icmp "eq" %18, %4 : i64
"llvm.intr.assume"(%19) : (i1) -> ()
// Entry 2: same sequence, indexing the table with %1 (= 2).
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32>
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64
%26 = llvm.and %25, %3 : i64
%27 = llvm.icmp "eq" %26, %4 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Compute: load 4 x f32 from entries 0 and 1, multiply element-wise, store to
// entry 2.
// NOTE(review): these accesses carry alignment = 4 although the assumes above
// assert 64-byte alignment of the same pointers.
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%32 = llvm.fmul %29, %31 : vector<4xf32>
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
llvm.return %0 : i32
}
}
}
}
// Host-side entry point: validates both arguments, allocates a 16-byte result
// buffer, records one dispatch of the executable above, submits it, and
// returns the result as a new buffer view.
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// %c0/%c1/%c2 are the binding keys below; %c16 is the minimum length and the
// allocation size; %c4 is the expected shape.
%c2 = arith.constant 2 : index
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
// 553648160 == 0x21000020; presumably the HAL element-type code for f32 --
// TODO confirm. Meaning of encoding value 1 is not visible here.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
// Check argument 0 (shape [4], type, encoding, >= 16-byte buffer).
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Check argument 1 the same way.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Allocate the %c16-byte result buffer.
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
// Single device case ("embedded-elf-x86_64"): bind inputs and result as
// bindings 0/1/2 of set 0, then dispatch with a 1x1x1 workgroup count.
hal.device.switch<%device : !hal.device>
#hal.device.match.executable.format<"embedded-elf-x86_64"> {
%executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layout(#executable_layout) : !hal.executable_layout
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c16],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16]
])
// The dispatch target is named by its nested symbol path at this stage.
hal.command_buffer.dispatch.symbol<%cmd : !hal.command_buffer> target(@simple_mul_dispatch_0::@embedded_elf_x86_64::@simple_mul_dispatch_0) workgroups([%c1, %c1, %c1])
hal.return
}
// Barrier after the dispatch, then end, submit and wait.
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::ResolveExportOrdinalsPass //----- //
// Device target: a "cpu" device with one executable target ("llvm" backend,
// "embedded-elf-x86_64" format).
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}>
// Executable layout: no push constants; one descriptor set (index 0) with
// three storage_buffer bindings numbered 0, 1 and 2.
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]>
// Stand-alone alias for the same executable target attribute that is nested
// inside #device_target_cpu above.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// Maps the single symbol s0 to ceil(s0 / 4).
#map = affine_map<()[s0] -> (s0 ceildiv 4)>
// Codegen translation info: CPUDoubleTilingExpert with workload_per_wg = [4].
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]>
// Whole-program module for simple_mul: one HAL executable holding the device
// kernel (LLVM dialect) and the host entry point @simple_mul.
// In this snapshot the host dispatch no longer names the export by symbol:
// it looks the executable up and dispatches entry point [0] (see the
// hal.device.switch region in @simple_mul below).
module attributes {hal.device.targets = [#device_target_cpu]} {
hal.executable private @simple_mul_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export ordinal 0 -- the same number used by the host-side dispatch [0].
// The region returns three index values: (ceildiv(%arg1, 4), 1, 1) --
// presumably the workgroup count; %arg0, %arg2 and %arg3 are unused.
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map()[%arg1]
hal.return %0, %c1, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Kernel body. Only %arg1 (dispatch state) is read. Loads three pointers
// from the table in dispatch-state field 10, multiplies the vector<4xf32>
// values behind the first two, stores the product through the third, and
// returns 0.
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
// %0: return value. %1/%2: table indices 2 and 1. %3/%4: mask 63 and the
// expected result 0 for the alignment assumptions below.
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(2 : i64) : i64
%2 = llvm.mlir.constant(1 : i64) : i64
%3 = llvm.mlir.constant(63 : index) : i64
%4 = llvm.mlir.constant(0 : index) : i64
// Entry 0 of the field-10 pointer table (presumably binding 0): load the
// struct by value, take field 10, load the pointer, view it as f32*, and
// assume (address & 63) == 0, i.e. 64-byte alignment.
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>>
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32>
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64
%10 = llvm.and %9, %3 : i64
%11 = llvm.icmp "eq" %10, %4 : i64
"llvm.intr.assume"(%11) : (i1) -> ()
// Entry 1: same sequence, indexing the table with %2 (= 1). The dispatch
// state struct is re-loaded rather than reusing %5.
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>>
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32>
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64
%18 = llvm.and %17, %3 : i64
%19 = llvm.icmp "eq" %18, %4 : i64
"llvm.intr.assume"(%19) : (i1) -> ()
// Entry 2: same sequence, indexing the table with %1 (= 2).
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32>
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64
%26 = llvm.and %25, %3 : i64
%27 = llvm.icmp "eq" %26, %4 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Compute: load 4 x f32 from entries 0 and 1, multiply element-wise, store to
// entry 2.
// NOTE(review): these accesses carry alignment = 4 although the assumes above
// assert 64-byte alignment of the same pointers.
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%32 = llvm.fmul %29, %31 : vector<4xf32>
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
llvm.return %0 : i32
}
}
}
}
// Host-side entry point: validates both arguments, allocates a 16-byte result
// buffer, records one dispatch of the executable above, submits it, and
// returns the result as a new buffer view.
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// %c0/%c1/%c2 are the binding keys below; %c16 is the minimum length and the
// allocation size; %c4 is the expected shape.
%c2 = arith.constant 2 : index
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
// 553648160 == 0x21000020; presumably the HAL element-type code for f32 --
// TODO confirm. Meaning of encoding value 1 is not visible here.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
// Check argument 0 (shape [4], type, encoding, >= 16-byte buffer).
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Check argument 1 the same way.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Allocate the %c16-byte result buffer.
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
// Single device case ("embedded-elf-x86_64"): bind inputs and result as
// bindings 0/1/2 of set 0, then dispatch with a 1x1x1 workgroup count.
hal.device.switch<%device : !hal.device>
#hal.device.match.executable.format<"embedded-elf-x86_64"> {
%executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layout(#executable_layout) : !hal.executable_layout
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c16],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16]
])
// Ordinal-based dispatch: fetch the device from the command buffer, look up
// the @simple_mul_dispatch_0 executable on it, and dispatch entry point [0]
// of that executable instead of a symbol path.
%0 = hal.command_buffer.device<%cmd : !hal.command_buffer> : !hal.device
%exe = hal.executable.lookup device(%0 : !hal.device) executable(@simple_mul_dispatch_0) : !hal.executable
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[0] workgroups([%c1, %c1, %c1])
hal.return
}
// Barrier after the dispatch, then end, submit and wait.
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::MaterializeResourceCachesPass //----- //
// Attribute aliases for this IR dump.
// #device_target_cpu: "cpu" device target listing one executable target ("llvm" backend, "embedded-elf-x86_64" format).
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}>
// #executable_layout: no push constants and a single descriptor set (0) with storage_buffer bindings 0, 1 and 2.
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]>
// #executable_target_embedded_elf_x86_64_: the same executable target attribute that #device_target_cpu lists inline.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map: maps its single symbol s0 to s0 ceildiv 4.
#map = affine_map<()[s0] -> (s0 ceildiv 4)>
// #translation: codegen translation info naming CPUDoubleTilingExpert with workload_per_wg = [4].
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]>
module attributes {hal.device.targets = [#device_target_cpu]} {
// Module-private global holding the descriptor set layout; the initializer that follows stores into it.
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer {
%device = hal.ex.shared_device : !hal.device
// Create a push_only layout with three storage_buffer bindings (0, 1, 2) on the shared device.
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer.return
}
// Module-private global holding the executable layout; the initializer that follows stores into it.
util.global private @_executable_layout_0 : !hal.executable_layout
util.initializer {
// Reads @_descriptor_set_layout_0, so this relies on that global having been stored already.
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device = hal.ex.shared_device : !hal.device
// Executable layout with zero push constants and the single descriptor set layout loaded above.
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.executable_layout
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout
util.initializer.return
}
// Module-private global holding the !hal.executable for @simple_mul_dispatch_0; the initializer that follows stores into it.
util.global private @_executable_simple_mul_dispatch_0 : !hal.executable
util.initializer {
%device = hal.ex.shared_device : !hal.device
// Select the executable by device capability:
//   - if the device matches executable format "embedded-elf-x86_64", create the executable from the
//     @embedded_elf_x86_64 variant using the layout stored in @_executable_layout_0;
//   - otherwise (#hal.match.always) yield a null executable.
%0 = hal.device.switch<%device : !hal.device> -> !hal.executable
#hal.device.match.executable.format<"embedded-elf-x86_64"> {
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable
hal.return %exe : !hal.executable
},
#hal.match.always {
%1 = util.null : !hal.executable
hal.return %1 : !hal.executable
}
// Store whichever value the switch produced (possibly null).
util.global.store %0, @_executable_simple_mul_dispatch_0 : !hal.executable
util.initializer.return
}
// Device executable @simple_mul_dispatch_0 with a single variant targeting #executable_target_embedded_elf_x86_64_.
hal.executable private @simple_mul_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export ordinal 0. Its region returns three index values: #map applied to %arg1, then 1 and 1
// (presumably the workgroup count derived from the workload - confirm).
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map()[%arg1]
hal.return %0, %c1, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// LLVM-dialect body of the dispatch. Only %arg1 (the dispatch state) is read; %arg0 and %arg2 are unused.
// It loads three pointers from the table in field 10 of the dispatch state (presumably the binding
// pointers - confirm), multiplies one vector<4xf32> from each of the first two element-wise, stores the
// product through the third, and returns 0.
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(2 : i64) : i64
%2 = llvm.mlir.constant(1 : i64) : i64
// 63 is the mask used below to test that the low six address bits are zero (64-byte alignment).
%3 = llvm.mlir.constant(63 : index) : i64
%4 = llvm.mlir.constant(0 : index) : i64
// Pointer at index 0 of the field-10 table: cast to f32* and assume (addr & 63) == 0.
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>>
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32>
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64
%10 = llvm.and %9, %3 : i64
%11 = llvm.icmp "eq" %10, %4 : i64
"llvm.intr.assume"(%11) : (i1) -> ()
// Pointer at index 1: same sequence; the dispatch state struct is reloaded from %arg1.
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>>
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32>
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64
%18 = llvm.and %17, %3 : i64
%19 = llvm.icmp "eq" %18, %4 : i64
"llvm.intr.assume"(%19) : (i1) -> ()
// Pointer at index 2: same sequence again.
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32>
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64
%26 = llvm.and %25, %3 : i64
%27 = llvm.icmp "eq" %26, %4 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Vector body: load 4 x f32 from pointers 0 and 1, multiply, store to pointer 2.
// NOTE(review): the load/store ops carry alignment = 4 although the assumes above state 64-byte alignment.
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%32 = llvm.fmul %29, %31 : vector<4xf32>
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
llvm.return %0 : i32
}
}
}
}
// Host-side entry point @simple_mul: takes two !hal.buffer_view arguments and returns a new one.
// In this dump the executable layout and the executable are read from module globals
// (@_executable_layout_0, @_executable_simple_mul_dispatch_0) inside the device switch.
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c2 = arith.constant 2 : index
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
// NOTE(review): 553648160 is passed as the buffer view element type; presumably the HAL code for f32 - confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
// Validate the first operand: shape [4], expected element type/encoding, backing buffer of at least 16 bytes.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Validate the second operand the same way.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Allocate the 16-byte result buffer.
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
// Device-specific region with a single case for the "embedded-elf-x86_64" executable format: bind the
// two inputs and the result to bindings 0, 1 and 2, then dispatch entry point [0] with a 1x1x1 workgroup count.
// NOTE(review): %0 (the command buffer's device) is not referenced by any later op in this function.
hal.device.switch<%device : !hal.device>
#hal.device.match.executable.format<"embedded-elf-x86_64"> {
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c16],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16]
])
%0 = hal.command_buffer.device<%cmd : !hal.command_buffer> : !hal.device
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1])
hal.return
}
// Barrier after the dispatch, then finish recording, submit and block until completion.
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
// Wrap the result buffer in a view with the same shape, element type and encoding as the inputs.
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::InlineDeviceSwitchesPass //----- //
// Initializer that creates a descriptor set layout and stores it in @_descriptor_set_layout_0.
util.initializer {
%device = hal.ex.shared_device : !hal.device
// push_only layout with three storage_buffer bindings (0, 1, 2).
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer.return
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::InlineDeviceSwitchesPass //----- //
// Initializer that builds an executable layout from @_descriptor_set_layout_0 and stores it in @_executable_layout_0.
util.initializer {
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device = hal.ex.shared_device : !hal.device
// Zero push constants and a single descriptor set layout.
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.executable_layout
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout
util.initializer.return
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::InlineDeviceSwitchesPass //----- //
// Initializer that stores either a created executable or null into @_executable_simple_mul_dispatch_0,
// expressed as explicit control flow on a device query.
util.initializer {
%device = hal.ex.shared_device : !hal.device
// Query the device for key "hal.executable.format" :: "embedded-elf-x86_64" (value defaults to false).
// Only %value is used below; %ok is not referenced.
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
cf.cond_br %value, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Format supported: create the executable from the @embedded_elf_x86_64 variant.
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable
cf.br ^bb5(%exe : !hal.executable)
^bb2: // pred: ^bb0
// Fallback case: the condition is the constant true, so this always branches to ^bb3 and never to ^bb4.
%true = arith.constant true
cf.cond_br %true, ^bb3, ^bb4
^bb3: // pred: ^bb2
%0 = util.null : !hal.executable
cf.br ^bb5(%0 : !hal.executable)
^bb4: // pred: ^bb2
util.unreachable "device not supported in the compiled configuration"
^bb5(%1: !hal.executable): // 2 preds: ^bb1, ^bb3
// Join point: %1 is the created executable or null.
util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable
util.initializer.return
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::InlineDeviceSwitchesPass //----- //
// Host-side entry point @simple_mul: takes two !hal.buffer_view arguments and returns a new one.
// In this dump the device-specific recording is expressed as explicit control flow on a device query:
// ^bb1 records the dispatch, ^bb2 is the unsupported-device path, ^bb3 finishes and submits.
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c2 = arith.constant 2 : index
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
// NOTE(review): 553648160 is passed as the buffer view element type; presumably the HAL code for f32 - confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
// Validate the first operand: shape [4], expected element type/encoding, backing buffer of at least 16 bytes.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Validate the second operand the same way.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Allocate the 16-byte result buffer.
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
// Query the device for key "hal.executable.format" :: "embedded-elf-x86_64" (value defaults to false).
// Only %value is used below; %ok is not referenced.
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
cf.cond_br %value, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Format supported: bind the two inputs and the result to bindings 0, 1 and 2, then dispatch
// entry point [0] with a 1x1x1 workgroup count.
// NOTE(review): %0 (the command buffer's device) is not referenced by any later op in this function.
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c16],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16]
])
%0 = hal.command_buffer.device<%cmd : !hal.command_buffer> : !hal.device
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1])
cf.br ^bb3
^bb2: // pred: ^bb0
// Format not supported: there is no fallback case in this function.
util.unreachable "device not supported in the compiled configuration"
^bb3: // pred: ^bb1
// Barrier after the dispatch, then finish recording, submit and block until completion.
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
// Wrap the result buffer in a view with the same shape, element type and encoding as the inputs.
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::MemoizeDeviceQueriesPass //----- //
// Attribute aliases for this IR dump.
// #device_target_cpu: "cpu" device target listing one executable target ("llvm" backend, "embedded-elf-x86_64" format).
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}>
// #executable_layout: no push constants and a single descriptor set (0) with storage_buffer bindings 0, 1 and 2.
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]>
// #executable_target_embedded_elf_x86_64_: the same executable target attribute that #device_target_cpu lists inline.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map: maps its single symbol s0 to s0 ceildiv 4.
#map = affine_map<()[s0] -> (s0 ceildiv 4)>
// #translation: codegen translation info naming CPUDoubleTilingExpert with workload_per_wg = [4].
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]>
module attributes {hal.device.targets = [#device_target_cpu]} {
// Module-private globals holding the two results of the device query below:
// @_device_query_0 receives %value and @_device_query_0_ok receives %ok.
util.global private @_device_query_0 : i1
util.global private @_device_query_0_ok : i1
util.initializer {
%device = hal.ex.shared_device : !hal.device
// Query the shared device for key "hal.executable.format" :: "embedded-elf-x86_64" (value defaults to false).
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
util.global.store %ok, @_device_query_0_ok : i1
util.global.store %value, @_device_query_0 : i1
util.initializer.return
}
// Module-private global holding the descriptor set layout; the initializer that follows stores into it.
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer {
%device = hal.ex.shared_device : !hal.device
// Create a push_only layout with three storage_buffer bindings (0, 1, 2) on the shared device.
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer.return
}
// Module-private global holding the executable layout; the initializer that follows stores into it.
util.global private @_executable_layout_0 : !hal.executable_layout
util.initializer {
// Reads @_descriptor_set_layout_0, so this relies on that global having been stored already.
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device = hal.ex.shared_device : !hal.device
// Executable layout with zero push constants and the single descriptor set layout loaded above.
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.executable_layout
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout
util.initializer.return
}
// Module-private global holding the !hal.executable for @simple_mul_dispatch_0; the initializer that
// follows stores either a created executable or null into it, branching on the query result in @_device_query_0.
util.global private @_executable_simple_mul_dispatch_0 : !hal.executable
util.initializer {
%device = hal.ex.shared_device : !hal.device
// NOTE(review): %_device_query_0_ok is loaded but not referenced by any later op in this initializer.
%_device_query_0_ok = util.global.load @_device_query_0_ok : i1
%_device_query_0 = util.global.load @_device_query_0 : i1
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Format supported: create the executable from the @embedded_elf_x86_64 variant.
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable
cf.br ^bb5(%exe : !hal.executable)
^bb2: // pred: ^bb0
// Fallback case: the condition is the constant true, so this always branches to ^bb3 and never to ^bb4.
%true = arith.constant true
cf.cond_br %true, ^bb3, ^bb4
^bb3: // pred: ^bb2
%0 = util.null : !hal.executable
cf.br ^bb5(%0 : !hal.executable)
^bb4: // pred: ^bb2
util.unreachable "device not supported in the compiled configuration"
^bb5(%1: !hal.executable): // 2 preds: ^bb1, ^bb3
// Join point: %1 is the created executable or null.
util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable
util.initializer.return
}
// Device executable @simple_mul_dispatch_0 with a single variant targeting #executable_target_embedded_elf_x86_64_.
hal.executable private @simple_mul_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export ordinal 0. Its region returns three index values: #map applied to %arg1, then 1 and 1
// (presumably the workgroup count derived from the workload - confirm).
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map()[%arg1]
hal.return %0, %c1, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// LLVM-dialect body of the dispatch. Only %arg1 (the dispatch state) is read; %arg0 and %arg2 are unused.
// It loads three pointers from the table in field 10 of the dispatch state (presumably the binding
// pointers - confirm), multiplies one vector<4xf32> from each of the first two element-wise, stores the
// product through the third, and returns 0.
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(2 : i64) : i64
%2 = llvm.mlir.constant(1 : i64) : i64
// 63 is the mask used below to test that the low six address bits are zero (64-byte alignment).
%3 = llvm.mlir.constant(63 : index) : i64
%4 = llvm.mlir.constant(0 : index) : i64
// Pointer at index 0 of the field-10 table: cast to f32* and assume (addr & 63) == 0.
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>>
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32>
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64
%10 = llvm.and %9, %3 : i64
%11 = llvm.icmp "eq" %10, %4 : i64
"llvm.intr.assume"(%11) : (i1) -> ()
// Pointer at index 1: same sequence; the dispatch state struct is reloaded from %arg1.
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>>
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32>
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64
%18 = llvm.and %17, %3 : i64
%19 = llvm.icmp "eq" %18, %4 : i64
"llvm.intr.assume"(%19) : (i1) -> ()
// Pointer at index 2: same sequence again.
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32>
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64
%26 = llvm.and %25, %3 : i64
%27 = llvm.icmp "eq" %26, %4 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Vector body: load 4 x f32 from pointers 0 and 1, multiply, store to pointer 2.
// NOTE(review): the load/store ops carry alignment = 4 although the assumes above state 64-byte alignment.
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%32 = llvm.fmul %29, %31 : vector<4xf32>
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
llvm.return %0 : i32
}
}
}
}
// Host-side entry point @simple_mul: takes two !hal.buffer_view arguments and returns a new one.
// In this dump the branch condition is read from the global @_device_query_0 rather than queried inline:
// ^bb1 records the dispatch, ^bb2 is the unsupported-device path, ^bb3 finishes and submits.
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c2 = arith.constant 2 : index
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
// NOTE(review): 553648160 is passed as the buffer view element type; presumably the HAL code for f32 - confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
// Validate the first operand: shape [4], expected element type/encoding, backing buffer of at least 16 bytes.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Validate the second operand the same way.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Allocate the 16-byte result buffer.
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
// NOTE(review): %_device_query_0_ok is loaded but not referenced by any later op in this function.
%_device_query_0_ok = util.global.load @_device_query_0_ok : i1
%_device_query_0 = util.global.load @_device_query_0 : i1
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Format supported: bind the two inputs and the result to bindings 0, 1 and 2, then dispatch
// entry point [0] with a 1x1x1 workgroup count.
// NOTE(review): %0 (the command buffer's device) is not referenced by any later op in this function.
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c16],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16]
])
%0 = hal.command_buffer.device<%cmd : !hal.command_buffer> : !hal.device
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1])
cf.br ^bb3
^bb2: // pred: ^bb0
// Format not supported: there is no fallback case in this function.
util.unreachable "device not supported in the compiled configuration"
^bb3: // pred: ^bb1
// Barrier after the dispatch, then finish recording, submit and block until completion.
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
// Wrap the result buffer in a view with the same shape, element type and encoding as the inputs.
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer //----- //
// Attribute aliases for this IR dump.
// #device_target_cpu: "cpu" device target listing one executable target ("llvm" backend, "embedded-elf-x86_64" format).
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}>
// #executable_layout: no push constants and a single descriptor set (0) with storage_buffer bindings 0, 1 and 2.
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]>
// #executable_target_embedded_elf_x86_64_: the same executable target attribute that #device_target_cpu lists inline.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map: maps its single symbol s0 to s0 ceildiv 4.
#map = affine_map<()[s0] -> (s0 ceildiv 4)>
// #translation: codegen translation info naming CPUDoubleTilingExpert with workload_per_wg = [4].
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]>
module attributes {hal.device.targets = [#device_target_cpu]} {
// Module-level cache of a device capability query.
// The initializer asks the shared device about key
// "hal.executable.format" :: "embedded-elf-x86_64" and stores both results:
//   %ok    -> @_device_query_0_ok
//   %value -> @_device_query_0
// NOTE(review): the trailing `= false` presumably supplies the default for %value — confirm.
util.global private @_device_query_0 : i1
util.global private @_device_query_0_ok : i1
util.initializer {
%device = hal.ex.shared_device : !hal.device
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
util.global.store %ok, @_device_query_0_ok : i1
util.global.store %value, @_device_query_0 : i1
util.initializer.return
}
// Module-level descriptor set layout.
// The initializer creates a push_only layout on the shared device with three
// storage_buffer bindings (ordinals 0, 1 and 2) and stores it in
// @_descriptor_set_layout_0.
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer {
%device = hal.ex.shared_device : !hal.device
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer.return
}
// Module-level executable layout.
// The initializer reads @_descriptor_set_layout_0, creates an executable layout
// with push_constants(0) and that single descriptor set layout, and stores the
// result in @_executable_layout_0.
// NOTE(review): this reads @_descriptor_set_layout_0, so it presumably relies on
// that global's initializer having already run — confirm initializer ordering rules.
util.global private @_executable_layout_0 : !hal.executable_layout
util.initializer {
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device = hal.ex.shared_device : !hal.device
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.executable_layout
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout
util.initializer.return
}
// Module-level handle to the executable used by the dispatch.
// The initializer branches on the cached @_device_query_0 flag:
//   true  (^bb1): create the executable from variant
//                 @simple_mul_dispatch_0::@embedded_elf_x86_64 using @_executable_layout_0.
//   false (^bb2): produce util.null instead.
// Both paths forward their value to ^bb3, which stores it in
// @_executable_simple_mul_dispatch_0.
util.global private @_executable_simple_mul_dispatch_0 : !hal.executable
util.initializer {
%device = hal.ex.shared_device : !hal.device
%_device_query_0 = util.global.load @_device_query_0 : i1
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable
util.initializer.return
}
// Executable @simple_mul_dispatch_0 with a single variant, @embedded_elf_x86_64.
// It holds one export (ordinal 0) and a builtin.module containing the LLVM-dialect
// body of the dispatch function.
hal.executable private @simple_mul_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: returns three index values — #map applied to %arg1, then 1 and 1.
// NOTE(review): these are presumably the X/Y/Z workgroup counts, with %arg1 the
// workload size — confirm against the export op's documentation.
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map()[%arg1]
hal.return %0, %c1, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Dispatch body. Only %arg1 (the dispatch state struct pointer) is read here;
// %arg0 and %arg2 are not used by any op in this function. Always returns 0 (i32).
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
// Constants: return code 0, pointer-table indices 2 and 1, mask 63 and zero for
// the alignment checks below.
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(2 : i64) : i64
%2 = llvm.mlir.constant(1 : i64) : i64
%3 = llvm.mlir.constant(63 : index) : i64
%4 = llvm.mlir.constant(0 : index) : i64
// Entry 0 of the ptr<ptr<i8>> table held in struct field 10: load the pointer,
// view it as f32*, and assume (address & 63) == 0.
// NOTE(review): field 10 is presumably the binding pointer table — confirm.
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>>
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32>
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64
%10 = llvm.and %9, %3 : i64
%11 = llvm.icmp "eq" %10, %4 : i64
"llvm.intr.assume"(%11) : (i1) -> ()
// Entry 1 of the same table: the struct is reloaded, then the same
// pointer load / f32* cast / 64-byte alignment assumption is applied.
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>>
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32>
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64
%18 = llvm.and %17, %3 : i64
%19 = llvm.icmp "eq" %18, %4 : i64
"llvm.intr.assume"(%19) : (i1) -> ()
// Entry 2 of the same table, handled the same way; this pointer is the store
// destination below.
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32>
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64
%26 = llvm.and %25, %3 : i64
%27 = llvm.icmp "eq" %26, %4 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Compute: load one vector<4xf32> from entry 0 and one from entry 1, multiply
// them elementwise, and store the product through entry 2.
// The loads and the store are annotated with alignment = 4.
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%32 = llvm.fmul %29, %31 : vector<4xf32>
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
llvm.return %0 : i32
}
}
}
}
// Entry point @simple_mul: takes two !hal.buffer_view arguments and returns a
// newly created !hal.buffer_view.
// Flow: validate both inputs, allocate the result buffer, record a command buffer
// containing one dispatch, submit it and wait, then wrap the result buffer in a view.
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c2 = arith.constant 2 : index
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
// 553648160 == 0x21000020.
// NOTE(review): presumably the HAL element type code for f32 — confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
// Validate %arg0: shape [4], the element type / encoding constants above, and a
// backing buffer of at least %c16 in length with the listed usage bits.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Same validation for %arg1.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Result buffer of size %c16, then a one-shot command buffer on the same device.
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
// Record the dispatch only when the cached device query is true; otherwise
// control reaches util.unreachable in ^bb2.
%_device_query_0 = util.global.load @_device_query_0 : i1
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
// Bindings 0 and 1 are the two input buffers, binding 2 is the result buffer;
// each uses the range operands [%c0, %c16].
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c16],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16]
])
// Dispatch entry point ordinal 0 of the cached executable with workgroups [1, 1, 1].
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1])
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
// Wrap the result buffer in a shape-[4] view using the same type/encoding constants.
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
^bb2: // pred: ^bb0
util.unreachable "device not supported in the compiled configuration"
}
}
// -----// IR Dump After CSE //----- //
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<()[s0] -> (s0 ceildiv 4)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]>
module attributes {hal.device.targets = [#device_target_cpu]} {
// Module-level cache of a device capability query.
// The initializer asks the shared device about key
// "hal.executable.format" :: "embedded-elf-x86_64" and stores both results:
//   %ok    -> @_device_query_0_ok
//   %value -> @_device_query_0
// NOTE(review): the trailing `= false` presumably supplies the default for %value — confirm.
util.global private @_device_query_0 : i1
util.global private @_device_query_0_ok : i1
util.initializer {
%device = hal.ex.shared_device : !hal.device
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
util.global.store %ok, @_device_query_0_ok : i1
util.global.store %value, @_device_query_0 : i1
util.initializer.return
}
// Module-level descriptor set layout.
// The initializer creates a push_only layout on the shared device with three
// storage_buffer bindings (ordinals 0, 1 and 2) and stores it in
// @_descriptor_set_layout_0.
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer {
%device = hal.ex.shared_device : !hal.device
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer.return
}
// Module-level executable layout.
// The initializer reads @_descriptor_set_layout_0, creates an executable layout
// with push_constants(0) and that single descriptor set layout, and stores the
// result in @_executable_layout_0.
// NOTE(review): this reads @_descriptor_set_layout_0, so it presumably relies on
// that global's initializer having already run — confirm initializer ordering rules.
util.global private @_executable_layout_0 : !hal.executable_layout
util.initializer {
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device = hal.ex.shared_device : !hal.device
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.executable_layout
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout
util.initializer.return
}
// Module-level handle to the executable used by the dispatch.
// The initializer branches on the cached @_device_query_0 flag:
//   true  (^bb1): create the executable from variant
//                 @simple_mul_dispatch_0::@embedded_elf_x86_64 using @_executable_layout_0.
//   false (^bb2): produce util.null instead.
// Both paths forward their value to ^bb3, which stores it in
// @_executable_simple_mul_dispatch_0.
util.global private @_executable_simple_mul_dispatch_0 : !hal.executable
util.initializer {
%device = hal.ex.shared_device : !hal.device
%_device_query_0 = util.global.load @_device_query_0 : i1
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable
util.initializer.return
}
// Executable @simple_mul_dispatch_0 with a single variant, @embedded_elf_x86_64.
// It holds one export (ordinal 0) and a builtin.module containing the LLVM-dialect
// body of the dispatch function.
hal.executable private @simple_mul_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: returns three index values — #map applied to %arg1, then 1 and 1.
// NOTE(review): these are presumably the X/Y/Z workgroup counts, with %arg1 the
// workload size — confirm against the export op's documentation.
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map()[%arg1]
hal.return %0, %c1, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Dispatch body. Only %arg1 (the dispatch state struct pointer) is read here;
// %arg0 and %arg2 are not used by any op in this function. Always returns 0 (i32).
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
// Constants: return code 0, pointer-table indices 2 and 1, mask 63 and zero for
// the alignment checks below.
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(2 : i64) : i64
%2 = llvm.mlir.constant(1 : i64) : i64
%3 = llvm.mlir.constant(63 : index) : i64
%4 = llvm.mlir.constant(0 : index) : i64
// Entry 0 of the ptr<ptr<i8>> table held in struct field 10: load the pointer,
// view it as f32*, and assume (address & 63) == 0.
// NOTE(review): field 10 is presumably the binding pointer table — confirm.
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>>
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32>
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64
%10 = llvm.and %9, %3 : i64
%11 = llvm.icmp "eq" %10, %4 : i64
"llvm.intr.assume"(%11) : (i1) -> ()
// Entry 1 of the same table: the struct is reloaded (CSE did not merge the
// repeated loads of %arg1), then the same cast / alignment assumption is applied.
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>>
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32>
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64
%18 = llvm.and %17, %3 : i64
%19 = llvm.icmp "eq" %18, %4 : i64
"llvm.intr.assume"(%19) : (i1) -> ()
// Entry 2 of the same table, handled the same way; this pointer is the store
// destination below.
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32>
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64
%26 = llvm.and %25, %3 : i64
%27 = llvm.icmp "eq" %26, %4 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Compute: load one vector<4xf32> from entry 0 and one from entry 1, multiply
// them elementwise, and store the product through entry 2.
// The loads and the store are annotated with alignment = 4.
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%32 = llvm.fmul %29, %31 : vector<4xf32>
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
llvm.return %0 : i32
}
}
}
}
// Entry point @simple_mul: takes two !hal.buffer_view arguments and returns a
// newly created !hal.buffer_view.
// Flow: validate both inputs, allocate the result buffer, record a command buffer
// containing one dispatch, submit it and wait, then wrap the result buffer in a view.
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c2 = arith.constant 2 : index
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
// 553648160 == 0x21000020.
// NOTE(review): presumably the HAL element type code for f32 — confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
// Validate %arg0: shape [4], the element type / encoding constants above, and a
// backing buffer of at least %c16 in length with the listed usage bits.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Same validation for %arg1.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Result buffer of size %c16, then a one-shot command buffer on the same device.
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
// Record the dispatch only when the cached device query is true; otherwise
// control reaches util.unreachable in ^bb2.
%_device_query_0 = util.global.load @_device_query_0 : i1
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
// Bindings 0 and 1 are the two input buffers, binding 2 is the result buffer;
// each uses the range operands [%c0, %c16].
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c16],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16]
])
// Dispatch entry point ordinal 0 of the cached executable with workgroups [1, 1, 1].
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1])
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
// Wrap the result buffer in a shape-[4] view using the same type/encoding constants.
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
^bb2: // pred: ^bb0
util.unreachable "device not supported in the compiled configuration"
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- //
// Initializer that queries the shared device for
// "hal.executable.format" :: "embedded-elf-x86_64" and stores %value into
// @_device_query_0 and %ok into @_device_query_0_ok.
// In this dump the store to @_device_query_0 is listed before the store to
// @_device_query_0_ok.
util.initializer {
%device = hal.ex.shared_device : !hal.device
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
util.global.store %value, @_device_query_0 : i1
util.global.store %ok, @_device_query_0_ok : i1
util.initializer.return
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- //
// Initializer that creates a push_only descriptor set layout with three
// storage_buffer bindings (ordinals 0, 1 and 2) on the shared device and stores
// it in @_descriptor_set_layout_0.
util.initializer {
%device = hal.ex.shared_device : !hal.device
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer.return
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- //
// Initializer that reads @_descriptor_set_layout_0, creates an executable layout
// with push_constants(0) and that single descriptor set layout, and stores the
// result in @_executable_layout_0.
util.initializer {
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device = hal.ex.shared_device : !hal.device
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.executable_layout
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout
util.initializer.return
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- //
// Initializer for @_executable_simple_mul_dispatch_0.
// The load of @_device_query_0 is the first op in the entry block here. It selects:
//   true  (^bb1): create the executable from variant
//                 @simple_mul_dispatch_0::@embedded_elf_x86_64 using @_executable_layout_0.
//   false (^bb2): produce util.null instead.
// ^bb3 stores whichever value arrives into @_executable_simple_mul_dispatch_0.
util.initializer {
%_device_query_0 = util.global.load @_device_query_0 : i1
%device = hal.ex.shared_device : !hal.device
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable
util.initializer.return
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- //
// Entry point @simple_mul: takes two !hal.buffer_view arguments and returns a
// newly created !hal.buffer_view.
// Flow: validate both inputs, allocate the result buffer, record a command buffer
// containing one dispatch, submit it and wait, then wrap the result buffer in a view.
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// In this dump all three global loads sit at the top of the entry block, ahead
// of the constants, even though two of them are only used in ^bb1.
%_device_query_0 = util.global.load @_device_query_0 : i1
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable
%c2 = arith.constant 2 : index
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
// 553648160 == 0x21000020.
// NOTE(review): presumably the HAL element type code for f32 — confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
// Validate %arg0: shape [4], the element type / encoding constants above, and a
// backing buffer of at least %c16 in length with the listed usage bits.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Same validation for %arg1.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Result buffer of size %c16, then a one-shot command buffer on the same device.
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
// Record the dispatch only when the cached device query is true; otherwise
// control reaches util.unreachable in ^bb2.
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Bindings 0 and 1 are the two input buffers, binding 2 is the result buffer;
// each uses the range operands [%c0, %c16].
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c16],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16]
])
// Dispatch entry point ordinal 0 of the cached executable with workgroups [1, 1, 1].
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1])
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
// Wrap the result buffer in a shape-[4] view using the same type/encoding constants.
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
^bb2: // pred: ^bb0
util.unreachable "device not supported in the compiled configuration"
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass //----- //
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<()[s0] -> (s0 ceildiv 4)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]>
module attributes {hal.device.targets = [#device_target_cpu]} {
// Module-level cache of a device capability query.
// The initializer asks the shared device about key
// "hal.executable.format" :: "embedded-elf-x86_64" and stores both results:
//   %value -> @_device_query_0
//   %ok    -> @_device_query_0_ok
// NOTE(review): the trailing `= false` presumably supplies the default for %value — confirm.
util.global private @_device_query_0 : i1
util.global private @_device_query_0_ok : i1
util.initializer {
%device = hal.ex.shared_device : !hal.device
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
util.global.store %value, @_device_query_0 : i1
util.global.store %ok, @_device_query_0_ok : i1
util.initializer.return
}
// Module-level descriptor set layout.
// The initializer creates a push_only layout on the shared device with three
// storage_buffer bindings (ordinals 0, 1 and 2) and stores it in
// @_descriptor_set_layout_0.
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer {
%device = hal.ex.shared_device : !hal.device
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer.return
}
// Module-level executable layout.
// The initializer reads @_descriptor_set_layout_0, creates an executable layout
// with push_constants(0) and that single descriptor set layout, and stores the
// result in @_executable_layout_0.
// NOTE(review): this reads @_descriptor_set_layout_0, so it presumably relies on
// that global's initializer having already run — confirm initializer ordering rules.
util.global private @_executable_layout_0 : !hal.executable_layout
util.initializer {
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device = hal.ex.shared_device : !hal.device
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.executable_layout
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout
util.initializer.return
}
// Module-level handle to the executable used by the dispatch.
// The initializer branches on the cached @_device_query_0 flag:
//   true  (^bb1): create the executable from variant
//                 @simple_mul_dispatch_0::@embedded_elf_x86_64 using @_executable_layout_0.
//   false (^bb2): produce util.null instead.
// Both paths forward their value to ^bb3, which stores it in
// @_executable_simple_mul_dispatch_0.
util.global private @_executable_simple_mul_dispatch_0 : !hal.executable
util.initializer {
%_device_query_0 = util.global.load @_device_query_0 : i1
%device = hal.ex.shared_device : !hal.device
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable
util.initializer.return
}
// Executable @simple_mul_dispatch_0 with a single variant, @embedded_elf_x86_64.
// It holds one export (ordinal 0) and a builtin.module containing the LLVM-dialect
// body of the dispatch function.
hal.executable private @simple_mul_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: returns three index values — #map applied to %arg1, then 1 and 1.
// NOTE(review): these are presumably the X/Y/Z workgroup counts, with %arg1 the
// workload size — confirm against the export op's documentation.
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map()[%arg1]
hal.return %0, %c1, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Dispatch body. Only %arg1 (the dispatch state struct pointer) is read here;
// %arg0 and %arg2 are not used by any op in this function. Always returns 0 (i32).
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
// Constants: return code 0, pointer-table indices 2 and 1, mask 63 and zero for
// the alignment checks below.
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(2 : i64) : i64
%2 = llvm.mlir.constant(1 : i64) : i64
%3 = llvm.mlir.constant(63 : index) : i64
%4 = llvm.mlir.constant(0 : index) : i64
// Entry 0 of the ptr<ptr<i8>> table held in struct field 10: load the pointer,
// view it as f32*, and assume (address & 63) == 0.
// NOTE(review): field 10 is presumably the binding pointer table — confirm.
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>>
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32>
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64
%10 = llvm.and %9, %3 : i64
%11 = llvm.icmp "eq" %10, %4 : i64
"llvm.intr.assume"(%11) : (i1) -> ()
// Entry 1 of the same table: the struct is reloaded, then the same
// pointer load / f32* cast / 64-byte alignment assumption is applied.
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>>
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32>
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64
%18 = llvm.and %17, %3 : i64
%19 = llvm.icmp "eq" %18, %4 : i64
"llvm.intr.assume"(%19) : (i1) -> ()
// Entry 2 of the same table, handled the same way; this pointer is the store
// destination below.
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32>
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64
%26 = llvm.and %25, %3 : i64
%27 = llvm.icmp "eq" %26, %4 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Compute: load one vector<4xf32> from entry 0 and one from entry 1, multiply
// them elementwise, and store the product through entry 2.
// The loads and the store are annotated with alignment = 4.
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%32 = llvm.fmul %29, %31 : vector<4xf32>
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
llvm.return %0 : i32
}
}
}
}
// Entry point @simple_mul: takes two !hal.buffer_view arguments and returns a
// newly created !hal.buffer_view.
// Flow: validate both inputs, allocate the result buffer, record a command buffer
// containing one dispatch, submit it and wait, then wrap the result buffer in a view.
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// In this dump the constants come first, followed by the three global loads.
%c1_i32 = arith.constant 1 : i32
// 553648160 == 0x21000020.
// NOTE(review): presumably the HAL element type code for f32 — confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%_device_query_0 = util.global.load @_device_query_0 : i1
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable
// Validate %arg0: shape [4], the element type / encoding constants above, and a
// backing buffer of at least %c16 in length with the listed usage bits.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Same validation for %arg1.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Result buffer of size %c16, then a one-shot command buffer on the same device.
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
// Record the dispatch only when the cached device query is true; otherwise
// control reaches util.unreachable in ^bb2.
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Bindings 0 and 1 are the two input buffers, binding 2 is the result buffer;
// each uses the range operands [%c0, %c16].
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c16],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16]
])
// Dispatch entry point ordinal 0 of the cached executable with workgroups [1, 1, 1].
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1])
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
// Wrap the result buffer in a shape-[4] view using the same type/encoding constants.
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
^bb2: // pred: ^bb0
util.unreachable "device not supported in the compiled configuration"
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass //----- //
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<()[s0] -> (s0 ceildiv 4)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]>
module attributes {hal.device.targets = [#device_target_cpu]} {
// Module-level i1 flag holding the result of the device format query performed
// by the initializer below.
util.global private @_device_query_0 : i1
util.initializer {
%device = hal.ex.shared_device : !hal.device
// Queries the device with key "hal.executable.format" :: "embedded-elf-x86_64".
// Only %value is stored; %ok is not used in this initializer.
// NOTE(review): the trailing `= false` looks like the default for %value when
// the query cannot be answered; confirm against the HAL dialect docs.
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
util.global.store %value, @_device_query_0 : i1
util.initializer.return
}
// Module-level descriptor set layout, created once by the initializer below.
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer {
%device = hal.ex.shared_device : !hal.device
// Layout with usage(push_only) and three storage_buffer bindings (0, 1, 2).
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer.return
}
// Module-level executable layout, created once by the initializer below.
util.global private @_executable_layout_0 : !hal.executable_layout
util.initializer {
// Reads the descriptor set layout global; this initializer therefore depends
// on @_descriptor_set_layout_0 having been stored already.
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device = hal.ex.shared_device : !hal.device
// Executable layout with push_constants(0) and that single set layout.
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.executable_layout
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout
util.initializer.return
}
// Module-level handle to the loaded executable, or null when the device query
// flag is false (see the two branches below).
util.global private @_executable_simple_mul_dispatch_0 : !hal.executable
util.initializer {
%_device_query_0 = util.global.load @_device_query_0 : i1
%device = hal.ex.shared_device : !hal.device
// Flag true -> ^bb1 creates the executable; flag false -> ^bb2 yields null.
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Create the executable from the @embedded_elf_x86_64 variant using the
// executable layout global.
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
// Join block: %1 is either the created executable or null.
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable
util.initializer.return
}
// Executable holding the single entry point @simple_mul_dispatch_0, with one
// variant targeting #executable_target_embedded_elf_x86_64_.
hal.executable private @simple_mul_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: returns the workgroup count (#map applied to %arg1, 1, 1).
// #map is declared in this dump's header as `s0 ceildiv 4`.
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map()[%arg1]
hal.return %0, %c1, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Dispatch body in the LLVM dialect. It loads two vector<4xf32> values,
// multiplies them element-wise, stores the product and returns 0.
// %arg1 points at the dispatch state struct; field 10 is a ptr<ptr<i8>> table
// from which entries 0, 1 and 2 are loaded. %arg0 and %arg2 are unused here.
// NOTE(review): field 10 is presumably the binding pointer table; confirm
// against the iree_hal_executable_dispatch_state_v0_t definition.
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
// %0: return code; %1, %2: table indices; %3: alignment mask; %4: zero.
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(2 : i64) : i64
%2 = llvm.mlir.constant(1 : i64) : i64
%3 = llvm.mlir.constant(63 : index) : i64
%4 = llvm.mlir.constant(0 : index) : i64
// Table entry 0, viewed as an f32 pointer (%8).
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>>
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32>
// (pointer & 63) == 0 is handed to llvm.intr.assume, i.e. the pointer is
// assumed to be 64-byte aligned.
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64
%10 = llvm.and %9, %3 : i64
%11 = llvm.icmp "eq" %10, %4 : i64
"llvm.intr.assume"(%11) : (i1) -> ()
// Table entry 1, viewed as an f32 pointer (%16), with the same alignment assume.
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>>
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32>
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64
%18 = llvm.and %17, %3 : i64
%19 = llvm.icmp "eq" %18, %4 : i64
"llvm.intr.assume"(%19) : (i1) -> ()
// Table entry 2, viewed as an f32 pointer (%24), with the same alignment assume.
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32>
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64
%26 = llvm.and %25, %3 : i64
%27 = llvm.icmp "eq" %26, %4 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Load 4 x f32 from entries 0 and 1, multiply, store the product to entry 2.
// NOTE(review): these accesses carry alignment = 4 although the assumes
// above state 64-byte alignment.
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%32 = llvm.fmul %29, %31 : vector<4xf32>
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
llvm.return %0 : i32
}
}
}
}
// Entry point: takes two buffer views and returns a newly created buffer view.
// Both inputs are asserted to have shape [4], element type code 553648160 and
// encoding code 1, and their backing buffers a minimum_length of 16.
// NOTE(review): 553648160 (0x21000020) is presumably the HAL code for f32;
// confirm against the HAL element type definitions.
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c1_i32 = arith.constant 1 : i32
%c553648160_i32 = arith.constant 553648160 : i32
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
// Cached module globals: device query flag, executable layout, executable.
%_device_query_0 = util.global.load @_device_query_0 : i1
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable
// Validate the first input and fetch its buffer.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Validate the second input and fetch its buffer.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Result buffer of size %c16; it backs the view returned from ^bb1.
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
// Branch on the device query flag. The command buffer has already been begun
// here; only ^bb1 ends and submits it.
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Bindings 0, 1, 2 map to %buffer, %buffer_0, %buffer_1, each with the range
// [%c0, %c16] (presumably offset and length; confirm against the op docs).
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c16],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16]
])
// Record entry [0] of the executable with workgroups [1, 1, 1].
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1])
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
// Wrap the result buffer in a view with the same shape/type/encoding as the inputs.
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
^bb2: // pred: ^bb0
// Taken when the device query flag is false.
util.unreachable "device not supported in the compiled configuration"
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FuseGlobalsPass //----- //
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<()[s0] -> (s0 ceildiv 4)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]>
module attributes {hal.device.targets = [#device_target_cpu]} {
// Module-level i1 flag holding the result of the device format query performed
// by the initializer below.
util.global private @_device_query_0 : i1
util.initializer {
%device = hal.ex.shared_device : !hal.device
// Queries the device with key "hal.executable.format" :: "embedded-elf-x86_64".
// Only %value is stored; %ok is not used in this initializer.
// NOTE(review): the trailing `= false` looks like the default for %value when
// the query cannot be answered; confirm against the HAL dialect docs.
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
util.global.store %value, @_device_query_0 : i1
util.initializer.return
}
// Module-level descriptor set layout, created once by the initializer below.
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer {
%device = hal.ex.shared_device : !hal.device
// Layout with usage(push_only) and three storage_buffer bindings (0, 1, 2).
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer.return
}
// Module-level executable layout, created once by the initializer below.
util.global private @_executable_layout_0 : !hal.executable_layout
util.initializer {
// Reads the descriptor set layout global; this initializer therefore depends
// on @_descriptor_set_layout_0 having been stored already.
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device = hal.ex.shared_device : !hal.device
// Executable layout with push_constants(0) and that single set layout.
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.executable_layout
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout
util.initializer.return
}
// Module-level handle to the loaded executable, or null when the device query
// flag is false (see the two branches below).
util.global private @_executable_simple_mul_dispatch_0 : !hal.executable
util.initializer {
%_device_query_0 = util.global.load @_device_query_0 : i1
%device = hal.ex.shared_device : !hal.device
// Flag true -> ^bb1 creates the executable; flag false -> ^bb2 yields null.
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Create the executable from the @embedded_elf_x86_64 variant using the
// executable layout global.
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
// Join block: %1 is either the created executable or null.
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable
util.initializer.return
}
// Executable holding the single entry point @simple_mul_dispatch_0, with one
// variant targeting #executable_target_embedded_elf_x86_64_.
hal.executable private @simple_mul_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: returns the workgroup count (#map applied to %arg1, 1, 1).
// #map is declared in this dump's header as `s0 ceildiv 4`.
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map()[%arg1]
hal.return %0, %c1, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Dispatch body in the LLVM dialect. It loads two vector<4xf32> values,
// multiplies them element-wise, stores the product and returns 0.
// %arg1 points at the dispatch state struct; field 10 is a ptr<ptr<i8>> table
// from which entries 0, 1 and 2 are loaded. %arg0 and %arg2 are unused here.
// NOTE(review): field 10 is presumably the binding pointer table; confirm
// against the iree_hal_executable_dispatch_state_v0_t definition.
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
// %0: return code; %1, %2: table indices; %3: alignment mask; %4: zero.
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(2 : i64) : i64
%2 = llvm.mlir.constant(1 : i64) : i64
%3 = llvm.mlir.constant(63 : index) : i64
%4 = llvm.mlir.constant(0 : index) : i64
// Table entry 0, viewed as an f32 pointer (%8).
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>>
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32>
// (pointer & 63) == 0 is handed to llvm.intr.assume, i.e. the pointer is
// assumed to be 64-byte aligned.
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64
%10 = llvm.and %9, %3 : i64
%11 = llvm.icmp "eq" %10, %4 : i64
"llvm.intr.assume"(%11) : (i1) -> ()
// Table entry 1, viewed as an f32 pointer (%16), with the same alignment assume.
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>>
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32>
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64
%18 = llvm.and %17, %3 : i64
%19 = llvm.icmp "eq" %18, %4 : i64
"llvm.intr.assume"(%19) : (i1) -> ()
// Table entry 2, viewed as an f32 pointer (%24), with the same alignment assume.
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32>
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64
%26 = llvm.and %25, %3 : i64
%27 = llvm.icmp "eq" %26, %4 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Load 4 x f32 from entries 0 and 1, multiply, store the product to entry 2.
// NOTE(review): these accesses carry alignment = 4 although the assumes
// above state 64-byte alignment.
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%32 = llvm.fmul %29, %31 : vector<4xf32>
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
llvm.return %0 : i32
}
}
}
}
// Entry point: takes two buffer views and returns a newly created buffer view.
// Both inputs are asserted to have shape [4], element type code 553648160 and
// encoding code 1, and their backing buffers a minimum_length of 16.
// NOTE(review): 553648160 (0x21000020) is presumably the HAL code for f32;
// confirm against the HAL element type definitions.
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c1_i32 = arith.constant 1 : i32
%c553648160_i32 = arith.constant 553648160 : i32
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
// Cached module globals: device query flag, executable layout, executable.
%_device_query_0 = util.global.load @_device_query_0 : i1
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable
// Validate the first input and fetch its buffer.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Validate the second input and fetch its buffer.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Result buffer of size %c16; it backs the view returned from ^bb1.
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
// Branch on the device query flag. The command buffer has already been begun
// here; only ^bb1 ends and submits it.
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Bindings 0, 1, 2 map to %buffer, %buffer_0, %buffer_1, each with the range
// [%c0, %c16] (presumably offset and length; confirm against the op docs).
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c16],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16]
])
// Record entry [0] of the executable with workgroups [1, 1, 1].
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1])
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
// Wrap the result buffer in a view with the same shape/type/encoding as the inputs.
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
^bb2: // pred: ^bb0
// Taken when the device query flag is false.
util.unreachable "device not supported in the compiled configuration"
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::ElideRedundantCommandsPass //----- //
// Initializer that stores the device format query result into @_device_query_0.
util.initializer {
%device = hal.ex.shared_device : !hal.device
// Queries the device with key "hal.executable.format" :: "embedded-elf-x86_64".
// Only %value is stored; %ok is not used in this initializer.
// NOTE(review): the trailing `= false` looks like the default for %value when
// the query cannot be answered; confirm against the HAL dialect docs.
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
util.global.store %value, @_device_query_0 : i1
util.initializer.return
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::ElideRedundantCommandsPass //----- //
// Initializer that creates the descriptor set layout and stores it into
// @_descriptor_set_layout_0.
util.initializer {
%device = hal.ex.shared_device : !hal.device
// Layout with usage(push_only) and three storage_buffer bindings (0, 1, 2).
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer.return
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::ElideRedundantCommandsPass //----- //
// Initializer that creates the executable layout and stores it into
// @_executable_layout_0.
util.initializer {
// Reads the descriptor set layout global; this initializer therefore depends
// on @_descriptor_set_layout_0 having been stored already.
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device = hal.ex.shared_device : !hal.device
// Executable layout with push_constants(0) and that single set layout.
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.executable_layout
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout
util.initializer.return
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::ElideRedundantCommandsPass //----- //
// Initializer that stores either the created executable or null into
// @_executable_simple_mul_dispatch_0, depending on the device query flag.
util.initializer {
%_device_query_0 = util.global.load @_device_query_0 : i1
%device = hal.ex.shared_device : !hal.device
// Flag true -> ^bb1 creates the executable; flag false -> ^bb2 yields null.
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Create the executable from the @embedded_elf_x86_64 variant using the
// executable layout global.
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
// Join block: %1 is either the created executable or null.
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable
util.initializer.return
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::ElideRedundantCommandsPass //----- //
// Entry point: takes two buffer views and returns a newly created buffer view.
// Both inputs are asserted to have shape [4], element type code 553648160 and
// encoding code 1, and their backing buffers a minimum_length of 16.
// NOTE(review): 553648160 (0x21000020) is presumably the HAL code for f32;
// confirm against the HAL element type definitions.
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c1_i32 = arith.constant 1 : i32
%c553648160_i32 = arith.constant 553648160 : i32
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
// Cached module globals: device query flag, executable layout, executable.
%_device_query_0 = util.global.load @_device_query_0 : i1
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable
// Validate the first input and fetch its buffer.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Validate the second input and fetch its buffer.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Result buffer of size %c16; it backs the view returned from ^bb1.
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
// Branch on the device query flag. The command buffer has already been begun
// here; only ^bb1 ends and submits it.
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Bindings 0, 1, 2 map to %buffer, %buffer_0, %buffer_1, each with the range
// [%c0, %c16] (presumably offset and length; confirm against the op docs).
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c16],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16]
])
// Record entry [0] of the executable with workgroups [1, 1, 1].
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1])
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
// Wrap the result buffer in a view with the same shape/type/encoding as the inputs.
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
^bb2: // pred: ^bb0
// Taken when the device query flag is false.
util.unreachable "device not supported in the compiled configuration"
}
// -----// IR Dump After ConvertAffineToStandard //----- //
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]>
module attributes {hal.device.targets = [#device_target_cpu]} {
// Module-level i1 flag holding the result of the device format query performed
// by the initializer below.
util.global private @_device_query_0 : i1
util.initializer {
%device = hal.ex.shared_device : !hal.device
// Queries the device with key "hal.executable.format" :: "embedded-elf-x86_64".
// Only %value is stored; %ok is not used in this initializer.
// NOTE(review): the trailing `= false` looks like the default for %value when
// the query cannot be answered; confirm against the HAL dialect docs.
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
util.global.store %value, @_device_query_0 : i1
util.initializer.return
}
// Module-level descriptor set layout, created once by the initializer below.
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer {
%device = hal.ex.shared_device : !hal.device
// Layout with usage(push_only) and three storage_buffer bindings (0, 1, 2).
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer.return
}
// Module-level executable layout, created once by the initializer below.
util.global private @_executable_layout_0 : !hal.executable_layout
util.initializer {
// Reads the descriptor set layout global; this initializer therefore depends
// on @_descriptor_set_layout_0 having been stored already.
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device = hal.ex.shared_device : !hal.device
// Executable layout with push_constants(0) and that single set layout.
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.executable_layout
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout
util.initializer.return
}
// Module-level handle to the loaded executable, or null when the device query
// flag is false (see the two branches below).
util.global private @_executable_simple_mul_dispatch_0 : !hal.executable
util.initializer {
%_device_query_0 = util.global.load @_device_query_0 : i1
%device = hal.ex.shared_device : !hal.device
// Flag true -> ^bb1 creates the executable; flag false -> ^bb2 yields null.
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Create the executable from the @embedded_elf_x86_64 variant using the
// executable layout global.
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
// Join block: %1 is either the created executable or null.
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable
util.initializer.return
}
// Executable holding the single entry point @simple_mul_dispatch_0, with one
// variant targeting #executable_target_embedded_elf_x86_64_.
hal.executable private @simple_mul_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: computes the first workgroup count with plain arith ops and
// returns (result, 1, 1). With x = %arg1 and signed division:
//   x <= 0 : result = 0 - ((0 - x) / 4)
//   x >  0 : result = ((x - 1) / 4) + 1
// This is the ceiling of x / 4.
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1_0 = arith.constant 1 : index
// %0 selects between the non-positive and positive formulas.
%0 = arith.cmpi sle, %arg1, %c0 : index
%1 = arith.subi %c0, %arg1 : index
%2 = arith.subi %arg1, %c1_0 : index
%3 = arith.select %0, %1, %2 : index
%4 = arith.divsi %3, %c4 : index
%5 = arith.subi %c0, %4 : index
%6 = arith.addi %4, %c1_0 : index
%7 = arith.select %0, %5, %6 : index
hal.return %7, %c1, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Dispatch body in the LLVM dialect. It loads two vector<4xf32> values,
// multiplies them element-wise, stores the product and returns 0.
// %arg1 points at the dispatch state struct; field 10 is a ptr<ptr<i8>> table
// from which entries 0, 1 and 2 are loaded. %arg0 and %arg2 are unused here.
// NOTE(review): field 10 is presumably the binding pointer table; confirm
// against the iree_hal_executable_dispatch_state_v0_t definition.
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
// %0: return code; %1, %2: table indices; %3: alignment mask; %4: zero.
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(2 : i64) : i64
%2 = llvm.mlir.constant(1 : i64) : i64
%3 = llvm.mlir.constant(63 : index) : i64
%4 = llvm.mlir.constant(0 : index) : i64
// Table entry 0, viewed as an f32 pointer (%8).
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>>
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32>
// (pointer & 63) == 0 is handed to llvm.intr.assume, i.e. the pointer is
// assumed to be 64-byte aligned.
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64
%10 = llvm.and %9, %3 : i64
%11 = llvm.icmp "eq" %10, %4 : i64
"llvm.intr.assume"(%11) : (i1) -> ()
// Table entry 1, viewed as an f32 pointer (%16), with the same alignment assume.
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>>
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32>
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64
%18 = llvm.and %17, %3 : i64
%19 = llvm.icmp "eq" %18, %4 : i64
"llvm.intr.assume"(%19) : (i1) -> ()
// Table entry 2, viewed as an f32 pointer (%24), with the same alignment assume.
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32>
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64
%26 = llvm.and %25, %3 : i64
%27 = llvm.icmp "eq" %26, %4 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Load 4 x f32 from entries 0 and 1, multiply, store the product to entry 2.
// NOTE(review): these accesses carry alignment = 4 although the assumes
// above state 64-byte alignment.
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%32 = llvm.fmul %29, %31 : vector<4xf32>
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
llvm.return %0 : i32
}
}
}
}
// Host entry point @simple_mul: takes two buffer views and returns a new buffer view
// backed by a freshly allocated buffer that is bound as binding 2 of the dispatch.
// Both inputs are asserted to have shape [4], element type code 553648160 and
// encoding code 1, and their backing buffers to have a minimum length of 16.
// NOTE(review): 553648160 is presumably the HAL element-type code for f32 (4 elements
// x 4 bytes would match the length of 16) - confirm against the HAL element type table.
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c1_i32 = arith.constant 1 : i32
%c553648160_i32 = arith.constant 553648160 : i32
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
// Read the device-support flag, executable layout and executable from module globals.
%_device_query_0 = util.global.load @_device_query_0 : i1
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable
// Validate input 0 and fetch its backing buffer.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
// The shared device's allocator is used for both buffer asserts and the output allocation.
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Validate input 1 the same way.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Allocate the output buffer (size operand %c16) and begin a one-shot command buffer.
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
// Record and submit only when @_device_query_0 is true; otherwise branch to ^bb2.
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Bind the two inputs as bindings 0 and 1 and the output as binding 2, each with
// range operands (%c0, %c16), on set index %c0 of the executable layout.
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c16],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16]
])
// Dispatch entry [0] of the executable with a 1x1x1 workgroup count, add an
// execution barrier, end the command buffer, then submit it and wait.
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1])
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
// Wrap the output buffer in a [4]-shaped view using the same type and encoding codes as the inputs.
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
^bb2: // pred: ^bb0
// Taken only when @_device_query_0 is false.
util.unreachable "device not supported in the compiled configuration"
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::CombineInitializersPass //----- //
// Attribute aliases used by the module below: the CPU device target, the executable
// layout (0 push constants, one descriptor set with three storage_buffer bindings),
// the embedded-elf-x86_64 executable target, and the codegen translation info
// (CPUDoubleTilingExpert, workload_per_wg = [4]).
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]>
// Module contents: four private globals, one initializer that fills them, the
// @simple_mul_dispatch_0 executable, and the host entry point @simple_mul.
module attributes {hal.device.targets = [#device_target_cpu]} {
// Globals stored by the initializer; @simple_mul loads all but @_descriptor_set_layout_0.
util.global private @_device_query_0 : i1
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.global private @_executable_layout_0 : !hal.executable_layout
util.global private @_executable_simple_mul_dispatch_0 : !hal.executable
// Single initializer that populates all four globals in sequence.
util.initializer {
// Query the shared device for the "embedded-elf-x86_64" executable format
// (default false) and store the answer.
%device = hal.ex.shared_device : !hal.device
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
util.global.store %value, @_device_query_0 : i1
// Create and store the push-only descriptor set layout with three storage_buffer bindings.
%device_0 = hal.ex.shared_device : !hal.device
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device_0 : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
// Reload the descriptor set layout and build the executable layout (0 push constants) from it.
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device_1 = hal.ex.shared_device : !hal.device
%executable_layout = hal.executable_layout.create device(%device_1 : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.executable_layout
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout
// Create the executable only when the format query returned true.
%_device_query_0 = util.global.load @_device_query_0 : i1
%device_2 = hal.ex.shared_device : !hal.device
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%exe = hal.executable.create device(%device_2 : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
// Format not supported: pass a null executable to ^bb3 instead.
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable
cf.br ^bb4
^bb4: // pred: ^bb3
util.initializer.return
}
// Executable with a single embedded-elf-x86_64 variant exporting @simple_mul_dispatch_0 at ordinal 0.
hal.executable private @simple_mul_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: computes the workgroup count from %arg1 as x = ceildiv(%arg1, 4),
// spelled out as (%arg1 <= 0) ? -((0 - %arg1) / 4) : ((%arg1 - 1) / 4) + 1,
// and returns (x, 1, 1). %c1 and %c1_0 are both the index constant 1.
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1_0 = arith.constant 1 : index
%0 = arith.cmpi sle, %arg1, %c0 : index
%1 = arith.subi %c0, %arg1 : index
%2 = arith.subi %arg1, %c1_0 : index
%3 = arith.select %0, %1, %2 : index
%4 = arith.divsi %3, %c4 : index
%5 = arith.subi %c0, %4 : index
%6 = arith.addi %4, %c1_0 : index
%7 = arith.select %0, %5, %6 : index
hal.return %7, %c1, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Dispatch body in the LLVM dialect: loads one vector<4xf32> from each of two pointers,
// multiplies them element-wise, stores the product through a third pointer, returns 0.
// The three pointers are entries 0, 1 and 2 of the ptr<ptr<i8>> table held in field 10
// of the dispatch-state struct pointed to by %arg1 (presumably the binding base
// pointers - confirm against the IREE executable library ABI). %arg0 and %arg2 are unused.
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(2 : i64) : i64
%2 = llvm.mlir.constant(1 : i64) : i64
%3 = llvm.mlir.constant(63 : index) : i64
%4 = llvm.mlir.constant(0 : index) : i64
// Table entry 0 as f32*; assume (address & 63) == 0.
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>>
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32>
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64
%10 = llvm.and %9, %3 : i64
%11 = llvm.icmp "eq" %10, %4 : i64
"llvm.intr.assume"(%11) : (i1) -> ()
// Table entry 1 as f32*; same alignment assumption.
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>>
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32>
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64
%18 = llvm.and %17, %3 : i64
%19 = llvm.icmp "eq" %18, %4 : i64
"llvm.intr.assume"(%19) : (i1) -> ()
// Table entry 2 as f32*; same alignment assumption.
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32>
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64
%26 = llvm.and %25, %3 : i64
%27 = llvm.icmp "eq" %26, %4 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Vector load from entries 0 and 1, multiply, vector store to entry 2.
// NOTE(review): these accesses carry alignment = 4 even though the assumes above
// state the addresses are multiples of 64.
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%32 = llvm.fmul %29, %31 : vector<4xf32>
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
llvm.return %0 : i32
}
}
}
}
// Host entry point: validates the two input buffer views (shape [4], type code
// 553648160, encoding code 1, buffer minimum length 16), allocates an output buffer,
// records and submits one dispatch of @simple_mul_dispatch_0, and returns the output
// wrapped in a [4]-shaped buffer view.
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c1_i32 = arith.constant 1 : i32
%c553648160_i32 = arith.constant 553648160 : i32
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
// Values stored by the initializer above.
%_device_query_0 = util.global.load @_device_query_0 : i1
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable
// Validate input 0 and fetch its backing buffer.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Validate input 1 the same way.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Allocate the output buffer (size operand %c16) and begin a one-shot command buffer.
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
// Record and submit only when the device-format query was true.
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Inputs go to bindings 0 and 1, the output to binding 2.
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c16],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16]
])
// Dispatch entry [0] with a 1x1x1 workgroup count, then barrier, end, submit and wait.
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1])
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
^bb2: // pred: ^bb0
// Taken only when @_device_query_0 is false.
util.unreachable "device not supported in the compiled configuration"
}
}
// -----// IR Dump After Canonicalizer //----- //
// Attribute aliases used by the module below: the CPU device target, the executable
// layout (0 push constants, one descriptor set with three storage_buffer bindings),
// the embedded-elf-x86_64 executable target, and the codegen translation info
// (CPUDoubleTilingExpert, workload_per_wg = [4]).
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]>
// Module contents: four private globals, one initializer that fills them, the
// @simple_mul_dispatch_0 executable, and the host entry point @simple_mul.
module attributes {hal.device.targets = [#device_target_cpu]} {
// Globals stored by the initializer; @simple_mul loads all but @_descriptor_set_layout_0.
util.global private @_device_query_0 : i1
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.global private @_executable_layout_0 : !hal.executable_layout
util.global private @_executable_simple_mul_dispatch_0 : !hal.executable
// Single initializer that populates all four globals in sequence.
util.initializer {
// Query the shared device for the "embedded-elf-x86_64" executable format
// (default false) and store the answer.
%device = hal.ex.shared_device : !hal.device
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
util.global.store %value, @_device_query_0 : i1
// Create and store the push-only descriptor set layout with three storage_buffer bindings.
%device_0 = hal.ex.shared_device : !hal.device
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device_0 : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
// Reload the descriptor set layout and build the executable layout (0 push constants) from it.
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device_1 = hal.ex.shared_device : !hal.device
%executable_layout = hal.executable_layout.create device(%device_1 : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.executable_layout
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout
// Create the executable only when the format query returned true.
%_device_query_0 = util.global.load @_device_query_0 : i1
%device_2 = hal.ex.shared_device : !hal.device
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%exe = hal.executable.create device(%device_2 : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
// Format not supported: pass a null executable to ^bb3 instead.
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable
util.initializer.return
}
// Executable with a single embedded-elf-x86_64 variant exporting @simple_mul_dispatch_0 at ordinal 0.
hal.executable private @simple_mul_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: computes the workgroup count from %arg1 as x = ceildiv(%arg1, 4),
// spelled out as (%arg1 <= 0) ? -((0 - %arg1) / 4) : ((%arg1 - 1) / 4) + 1,
// and returns (x, 1, 1).
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = arith.cmpi sle, %arg1, %c0 : index
%1 = arith.subi %c0, %arg1 : index
%2 = arith.subi %arg1, %c1 : index
%3 = arith.select %0, %1, %2 : index
%4 = arith.divsi %3, %c4 : index
%5 = arith.subi %c0, %4 : index
%6 = arith.addi %4, %c1 : index
%7 = arith.select %0, %5, %6 : index
hal.return %7, %c1, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Dispatch body in the LLVM dialect: loads one vector<4xf32> from each of two pointers,
// multiplies them element-wise, stores the product through a third pointer, returns 0.
// The three pointers are entries 0, 1 and 2 of the ptr<ptr<i8>> table held in field 10
// of the dispatch-state struct pointed to by %arg1 (presumably the binding base
// pointers - confirm against the IREE executable library ABI). %arg0 and %arg2 are unused.
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(2 : i64) : i64
%2 = llvm.mlir.constant(1 : i64) : i64
%3 = llvm.mlir.constant(63 : index) : i64
%4 = llvm.mlir.constant(0 : index) : i64
// Table entry 0 as f32*; assume (address & 63) == 0.
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>>
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32>
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64
%10 = llvm.and %9, %3 : i64
%11 = llvm.icmp "eq" %10, %4 : i64
"llvm.intr.assume"(%11) : (i1) -> ()
// Table entry 1 as f32*; same alignment assumption.
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>>
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32>
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64
%18 = llvm.and %17, %3 : i64
%19 = llvm.icmp "eq" %18, %4 : i64
"llvm.intr.assume"(%19) : (i1) -> ()
// Table entry 2 as f32*; same alignment assumption.
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32>
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64
%26 = llvm.and %25, %3 : i64
%27 = llvm.icmp "eq" %26, %4 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Vector load from entries 0 and 1, multiply, vector store to entry 2.
// NOTE(review): these accesses carry alignment = 4 even though the assumes above
// state the addresses are multiples of 64.
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%32 = llvm.fmul %29, %31 : vector<4xf32>
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
llvm.return %0 : i32
}
}
}
}
// Host entry point: validates the two input buffer views (shape [4], type code
// 553648160, encoding code 1, buffer minimum length 16), allocates an output buffer,
// records and submits one dispatch of @simple_mul_dispatch_0, and returns the output
// wrapped in a [4]-shaped buffer view.
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c1_i32 = arith.constant 1 : i32
%c553648160_i32 = arith.constant 553648160 : i32
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
// Values stored by the initializer above.
%_device_query_0 = util.global.load @_device_query_0 : i1
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable
// Validate input 0 and fetch its backing buffer.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Validate input 1 the same way.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Allocate the output buffer (size operand %c16) and begin a one-shot command buffer.
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
// Record and submit only when the device-format query was true.
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Inputs go to bindings 0 and 1, the output to binding 2.
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c16],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16]
])
// Dispatch entry [0] with a 1x1x1 workgroup count, then barrier, end, submit and wait.
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1])
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
^bb2: // pred: ^bb0
// Taken only when @_device_query_0 is false.
util.unreachable "device not supported in the compiled configuration"
}
}
// -----// IR Dump After CSE //----- //
// Attribute aliases used by the module below: the CPU device target, the executable
// layout (0 push constants, one descriptor set with three storage_buffer bindings),
// the embedded-elf-x86_64 executable target, and the codegen translation info
// (CPUDoubleTilingExpert, workload_per_wg = [4]).
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]>
// Module contents: four private globals, one initializer that fills them, the
// @simple_mul_dispatch_0 executable, and the host entry point @simple_mul.
module attributes {hal.device.targets = [#device_target_cpu]} {
// Globals stored by the initializer; @simple_mul loads all but @_descriptor_set_layout_0.
util.global private @_device_query_0 : i1
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.global private @_executable_layout_0 : !hal.executable_layout
util.global private @_executable_simple_mul_dispatch_0 : !hal.executable
// Single initializer that populates all four globals; every HAL op here uses the
// one %device value.
util.initializer {
// Query the shared device for the "embedded-elf-x86_64" executable format
// (default false) and store the answer.
%device = hal.ex.shared_device : !hal.device
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
util.global.store %value, @_device_query_0 : i1
// Create and store the push-only descriptor set layout with three storage_buffer bindings.
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
// Reload the descriptor set layout and build the executable layout (0 push constants) from it.
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.executable_layout
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout
// Create the executable only when the format query returned true.
%_device_query_0 = util.global.load @_device_query_0 : i1
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
// Format not supported: pass a null executable to ^bb3 instead.
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable
util.initializer.return
}
// Executable with a single embedded-elf-x86_64 variant exporting @simple_mul_dispatch_0 at ordinal 0.
hal.executable private @simple_mul_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: computes the workgroup count from %arg1 as x = ceildiv(%arg1, 4),
// spelled out as (%arg1 <= 0) ? -((0 - %arg1) / 4) : ((%arg1 - 1) / 4) + 1,
// and returns (x, 1, 1).
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = arith.cmpi sle, %arg1, %c0 : index
%1 = arith.subi %c0, %arg1 : index
%2 = arith.subi %arg1, %c1 : index
%3 = arith.select %0, %1, %2 : index
%4 = arith.divsi %3, %c4 : index
%5 = arith.subi %c0, %4 : index
%6 = arith.addi %4, %c1 : index
%7 = arith.select %0, %5, %6 : index
hal.return %7, %c1, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Dispatch body in the LLVM dialect: loads one vector<4xf32> from each of two pointers,
// multiplies them element-wise, stores the product through a third pointer, returns 0.
// The three pointers are entries 0, 1 and 2 of the ptr<ptr<i8>> table held in field 10
// of the dispatch-state struct pointed to by %arg1 (presumably the binding base
// pointers - confirm against the IREE executable library ABI). %arg0 and %arg2 are unused.
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(2 : i64) : i64
%2 = llvm.mlir.constant(1 : i64) : i64
%3 = llvm.mlir.constant(63 : index) : i64
%4 = llvm.mlir.constant(0 : index) : i64
// Table entry 0 as f32*; assume (address & 63) == 0.
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>>
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32>
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64
%10 = llvm.and %9, %3 : i64
%11 = llvm.icmp "eq" %10, %4 : i64
"llvm.intr.assume"(%11) : (i1) -> ()
// Table entry 1 as f32*; same alignment assumption.
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>>
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32>
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64
%18 = llvm.and %17, %3 : i64
%19 = llvm.icmp "eq" %18, %4 : i64
"llvm.intr.assume"(%19) : (i1) -> ()
// Table entry 2 as f32*; same alignment assumption.
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32>
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64
%26 = llvm.and %25, %3 : i64
%27 = llvm.icmp "eq" %26, %4 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Vector load from entries 0 and 1, multiply, vector store to entry 2.
// NOTE(review): these accesses carry alignment = 4 even though the assumes above
// state the addresses are multiples of 64.
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%32 = llvm.fmul %29, %31 : vector<4xf32>
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
llvm.return %0 : i32
}
}
}
}
// Host entry point: validates the two input buffer views (shape [4], type code
// 553648160, encoding code 1, buffer minimum length 16), allocates an output buffer,
// records and submits one dispatch of @simple_mul_dispatch_0, and returns the output
// wrapped in a [4]-shaped buffer view.
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c1_i32 = arith.constant 1 : i32
%c553648160_i32 = arith.constant 553648160 : i32
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
// Values stored by the initializer above.
%_device_query_0 = util.global.load @_device_query_0 : i1
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable
// Validate input 0 and fetch its backing buffer.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Validate input 1 the same way.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Allocate the output buffer (size operand %c16) and begin a one-shot command buffer.
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
// Record and submit only when the device-format query was true.
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Inputs go to bindings 0 and 1, the output to binding 2.
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c16],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16]
])
// Dispatch entry [0] with a 1x1x1 workgroup count, then barrier, end, submit and wait.
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1])
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
^bb2: // pred: ^bb0
// Taken only when @_device_query_0 is false.
util.unreachable "device not supported in the compiled configuration"
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- //
// Module initializer: resolves the shared HAL device, records whether it
// supports the "embedded-elf-x86_64" executable format, builds the descriptor
// set / executable layouts, and stores the results into module globals.
util.initializer {
%device = hal.ex.shared_device : !hal.device
// %ok is not used anywhere below; only %value is cached and drives the branch.
// NOTE(review): the trailing `= false` looks like the default for %value when
// the key is not recognized - confirm against the op definition.
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
// One push_only descriptor set layout with three storage_buffer bindings (0, 1, 2).
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout
// Executable layout: push_constants(0) and the single set layout created above.
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%descriptor_set_layout]) : !hal.executable_layout
// Cache the layouts and the query result in globals.
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.global.store %value, @_device_query_0 : i1
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout
// Only create the executable when the format query returned true.
cf.cond_br %value, ^bb1, ^bb2
^bb1: // pred: ^bb0
// The layout is re-read from the global that was stored just above.
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
// Query returned false: continue with a null executable.
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
// %1 is either the created executable or null, depending on the branch taken.
util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable
util.initializer.return
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- //
// Host-side entry point: validates the two input buffer views, allocates a
// result buffer, records and submits a single dispatch of the cached
// executable, and returns a buffer view over the result buffer.
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Values cached in module globals (stored by the module initializer).
%_device_query_0 = util.global.load @_device_query_0 : i1
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable
// %c1_i32 is passed as encoding(...) and %c553648160_i32 as type(...) below.
// NOTE(review): 553648160 is an opaque element-type code here; presumably a
// 32-bit float code - confirm against the HAL element type enum.
%c1_i32 = arith.constant 1 : i32
%c553648160_i32 = arith.constant 553648160 : i32
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
// First input: must be a view of shape [4] with the expected type/encoding.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
// Backing buffer must have minimum_length %c16 (16; presumably bytes, i.e.
// 4 elements x 4 bytes - confirm) and be DeviceVisible with the listed usage.
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Second input: same checks as the first.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Result buffer of size %c16, host- and device-visible, mappable.
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16}
// The command buffer is created and begun before the device-support branch.
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
// Branch on the cached result of the executable-format query.
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Bindings 0 and 1 are the two inputs, binding 2 is the result buffer; each
// uses the range [%c0, %c16].
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c16],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16]
])
// Dispatch entry [0] of the cached executable with a 1x1x1 workgroup count.
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1])
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
// Wrap the result buffer in a view with the same shape/type/encoding as the inputs.
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
^bb2: // pred: ^bb0
// Taken when the cached device query is false.
util.unreachable "device not supported in the compiled configuration"
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass //----- //
// Whole-module snapshot. Contents: attribute aliases, four private globals,
// the module initializer, the @simple_mul_dispatch_0 executable (workgroup
// count region + LLVM-dialect kernel), and the host entry point @simple_mul.
// Target description: "cpu" device with one "llvm" / "embedded-elf-x86_64" executable target.
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}>
// Layout used by the dispatch: no push constants, one set with three storage_buffer bindings.
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]>
module attributes {hal.device.targets = [#device_target_cpu]} {
// Globals written by the initializer below. @_descriptor_set_layout_0 is
// stored but never loaded anywhere in this module.
util.global private @_device_query_0 : i1
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.global private @_executable_layout_0 : !hal.executable_layout
util.global private @_executable_simple_mul_dispatch_0 : !hal.executable
// Initializer: queries format support, builds layouts, and creates (or nulls)
// the executable handle.
util.initializer {
%device = hal.ex.shared_device : !hal.device
// %ok is unused below; %value is cached and drives the branch.
// NOTE(review): the trailing `= false` looks like the default for %value -
// confirm against the op definition.
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%descriptor_set_layout]) : !hal.executable_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.global.store %value, @_device_query_0 : i1
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout
// Only create the executable when the format query returned true.
cf.cond_br %value, ^bb1, ^bb2
^bb1: // pred: ^bb0
// The layout is re-read from the global that was stored just above.
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
// Query returned false: continue with a null executable.
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable
util.initializer.return
}
// Device executable with a single variant for the embedded-elf-x86_64 target.
hal.executable private @simple_mul_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: returns the (x, y, z) workgroup count from the index arguments.
// x is the ceiling of %arg1 / 4 written with truncating signed division:
//   %arg1 >  0 : ((%arg1 - 1) / 4) + 1
//   %arg1 <= 0 : 0 - ((0 - %arg1) / 4)
// y and z are the constant 1. %arg0, %arg2 and %arg3 are not used.
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = arith.cmpi sle, %arg1, %c0 : index
%1 = arith.subi %c0, %arg1 : index
%2 = arith.subi %arg1, %c1 : index
%3 = arith.select %0, %1, %2 : index
%4 = arith.divsi %3, %c4 : index
%5 = arith.subi %c0, %4 : index
%6 = arith.addi %4, %c1 : index
%7 = arith.select %0, %5, %6 : index
hal.return %7, %c1, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Kernel body in the LLVM dialect. Only %arg1 (dispatch state) is read;
// %arg0 and %arg2 are unused. Always returns the i32 constant 0.
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(2 : i64) : i64
%2 = llvm.mlir.constant(1 : i64) : i64
// 63 is the mask for the alignment assumptions below (low six address bits).
%3 = llvm.mlir.constant(63 : index) : i64
%4 = llvm.mlir.constant(0 : index) : i64
// Pointer #0: load the dispatch state, take struct field 10 (ptr<ptr<i8>>),
// and load its first element.
// NOTE(review): field 10 is presumably the table of binding base pointers -
// confirm against iree_hal_executable_dispatch_state_v0_t.
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>>
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32>
// Assume (address & 63) == 0, i.e. the pointer is 64-byte aligned.
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64
%10 = llvm.and %9, %3 : i64
%11 = llvm.icmp "eq" %10, %4 : i64
"llvm.intr.assume"(%11) : (i1) -> ()
// Pointer #1: same sequence using element index 1 (%2); the dispatch state is reloaded.
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>>
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32>
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64
%18 = llvm.and %17, %3 : i64
%19 = llvm.icmp "eq" %18, %4 : i64
"llvm.intr.assume"(%19) : (i1) -> ()
// Pointer #2: same sequence using element index 2 (%1).
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32>
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64
%26 = llvm.and %25, %3 : i64
%27 = llvm.icmp "eq" %26, %4 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Compute: load one vector<4xf32> from pointer #0 and one from pointer #1,
// multiply elementwise, and store the product through pointer #2.
// The loads/stores are tagged alignment = 4 even though the assumes above
// state 64-byte alignment.
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%32 = llvm.fmul %29, %31 : vector<4xf32>
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
llvm.return %0 : i32
}
}
}
}
// Host-side entry point: validates both inputs, allocates the result buffer,
// records and submits one dispatch, and returns a view over the result.
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c2 = arith.constant 2 : index
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
// NOTE(review): 553648160 is an opaque element-type code passed to type(...);
// presumably f32 since the kernel multiplies vector<4xf32> - confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
// Values cached by the initializer.
%_device_query_0 = util.global.load @_device_query_0 : i1
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable
// Each input must be a view of shape [4] whose buffer has minimum_length %c16 (16).
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Result buffer of size %c16.
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16}
// The command buffer is created and begun before the device-support branch.
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Bindings 0 and 1 are the inputs, binding 2 is the result; each uses the range [%c0, %c16].
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c16],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16]
])
// Dispatch entry [0] with a 1x1x1 workgroup count, then barrier, end, submit and wait.
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1])
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
^bb2: // pred: ^bb0
// Taken when the cached device query is false.
util.unreachable "device not supported in the compiled configuration"
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass //----- //
// Whole-module snapshot. Contents: attribute aliases, three private globals,
// the module initializer, the @simple_mul_dispatch_0 executable (workgroup
// count region + LLVM-dialect kernel), and the host entry point @simple_mul.
// Target description: "cpu" device with one "llvm" / "embedded-elf-x86_64" executable target.
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}>
// Layout used by the dispatch: no push constants, one set with three storage_buffer bindings.
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]>
module attributes {hal.device.targets = [#device_target_cpu]} {
// Globals written by the initializer below and read by @simple_mul.
util.global private @_device_query_0 : i1
util.global private @_executable_layout_0 : !hal.executable_layout
util.global private @_executable_simple_mul_dispatch_0 : !hal.executable
// Initializer: queries format support, builds layouts, and creates (or nulls)
// the executable handle.
util.initializer {
%device = hal.ex.shared_device : !hal.device
// %ok is unused below; %value is cached and drives the branch.
// NOTE(review): the trailing `= false` looks like the default for %value -
// confirm against the op definition.
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
// The descriptor set layout is only consumed by the executable layout below;
// it is not stored to a global in this snapshot.
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%descriptor_set_layout]) : !hal.executable_layout
util.global.store %value, @_device_query_0 : i1
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout
// Only create the executable when the format query returned true.
cf.cond_br %value, ^bb1, ^bb2
^bb1: // pred: ^bb0
// The layout is re-read from the global that was stored just above.
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
// Query returned false: continue with a null executable.
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable
util.initializer.return
}
// Device executable with a single variant for the embedded-elf-x86_64 target.
hal.executable private @simple_mul_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: returns the (x, y, z) workgroup count from the index arguments.
// x is the ceiling of %arg1 / 4 written with truncating signed division:
//   %arg1 >  0 : ((%arg1 - 1) / 4) + 1
//   %arg1 <= 0 : 0 - ((0 - %arg1) / 4)
// y and z are the constant 1. %arg0, %arg2 and %arg3 are not used.
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = arith.cmpi sle, %arg1, %c0 : index
%1 = arith.subi %c0, %arg1 : index
%2 = arith.subi %arg1, %c1 : index
%3 = arith.select %0, %1, %2 : index
%4 = arith.divsi %3, %c4 : index
%5 = arith.subi %c0, %4 : index
%6 = arith.addi %4, %c1 : index
%7 = arith.select %0, %5, %6 : index
hal.return %7, %c1, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Kernel body in the LLVM dialect. Only %arg1 (dispatch state) is read;
// %arg0 and %arg2 are unused. Always returns the i32 constant 0.
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(2 : i64) : i64
%2 = llvm.mlir.constant(1 : i64) : i64
// 63 is the mask for the alignment assumptions below (low six address bits).
%3 = llvm.mlir.constant(63 : index) : i64
%4 = llvm.mlir.constant(0 : index) : i64
// Pointer #0: load the dispatch state, take struct field 10 (ptr<ptr<i8>>),
// and load its first element.
// NOTE(review): field 10 is presumably the table of binding base pointers -
// confirm against iree_hal_executable_dispatch_state_v0_t.
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>>
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32>
// Assume (address & 63) == 0, i.e. the pointer is 64-byte aligned.
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64
%10 = llvm.and %9, %3 : i64
%11 = llvm.icmp "eq" %10, %4 : i64
"llvm.intr.assume"(%11) : (i1) -> ()
// Pointer #1: same sequence using element index 1 (%2); the dispatch state is reloaded.
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>>
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32>
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64
%18 = llvm.and %17, %3 : i64
%19 = llvm.icmp "eq" %18, %4 : i64
"llvm.intr.assume"(%19) : (i1) -> ()
// Pointer #2: same sequence using element index 2 (%1).
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32>
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64
%26 = llvm.and %25, %3 : i64
%27 = llvm.icmp "eq" %26, %4 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Compute: load one vector<4xf32> from pointer #0 and one from pointer #1,
// multiply elementwise, and store the product through pointer #2.
// The loads/stores are tagged alignment = 4 even though the assumes above
// state 64-byte alignment.
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%32 = llvm.fmul %29, %31 : vector<4xf32>
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
llvm.return %0 : i32
}
}
}
}
// Host-side entry point: validates both inputs, allocates the result buffer,
// records and submits one dispatch, and returns a view over the result.
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c2 = arith.constant 2 : index
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
// NOTE(review): 553648160 is an opaque element-type code passed to type(...);
// presumably f32 since the kernel multiplies vector<4xf32> - confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
// Values cached by the initializer.
%_device_query_0 = util.global.load @_device_query_0 : i1
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable
// Each input must be a view of shape [4] whose buffer has minimum_length %c16 (16).
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Result buffer of size %c16.
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16}
// The command buffer is created and begun before the device-support branch.
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Bindings 0 and 1 are the inputs, binding 2 is the result; each uses the range [%c0, %c16].
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c16],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16]
])
// Dispatch entry [0] with a 1x1x1 workgroup count, then barrier, end, submit and wait.
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1])
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
^bb2: // pred: ^bb0
// Taken when the cached device query is false.
util.unreachable "device not supported in the compiled configuration"
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FuseGlobalsPass //----- //
// Whole-module snapshot. Contents: attribute aliases, three private globals,
// the module initializer, the @simple_mul_dispatch_0 executable (workgroup
// count region + LLVM-dialect kernel), and the host entry point @simple_mul.
// Target description: "cpu" device with one "llvm" / "embedded-elf-x86_64" executable target.
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}>
// Layout used by the dispatch: no push constants, one set with three storage_buffer bindings.
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]>
module attributes {hal.device.targets = [#device_target_cpu]} {
// Globals written by the initializer below and read by @simple_mul.
util.global private @_device_query_0 : i1
util.global private @_executable_layout_0 : !hal.executable_layout
util.global private @_executable_simple_mul_dispatch_0 : !hal.executable
// Initializer: queries format support, builds layouts, and creates (or nulls)
// the executable handle.
util.initializer {
%device = hal.ex.shared_device : !hal.device
// %ok is unused below; %value is cached and drives the branch.
// NOTE(review): the trailing `= false` looks like the default for %value -
// confirm against the op definition.
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
// The descriptor set layout is only consumed by the executable layout below;
// it is not stored to a global in this snapshot.
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%descriptor_set_layout]) : !hal.executable_layout
util.global.store %value, @_device_query_0 : i1
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout
// Only create the executable when the format query returned true.
cf.cond_br %value, ^bb1, ^bb2
^bb1: // pred: ^bb0
// The layout is re-read from the global that was stored just above.
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
// Query returned false: continue with a null executable.
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable
util.initializer.return
}
// Device executable with a single variant for the embedded-elf-x86_64 target.
hal.executable private @simple_mul_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: returns the (x, y, z) workgroup count from the index arguments.
// x is the ceiling of %arg1 / 4 written with truncating signed division:
//   %arg1 >  0 : ((%arg1 - 1) / 4) + 1
//   %arg1 <= 0 : 0 - ((0 - %arg1) / 4)
// y and z are the constant 1. %arg0, %arg2 and %arg3 are not used.
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = arith.cmpi sle, %arg1, %c0 : index
%1 = arith.subi %c0, %arg1 : index
%2 = arith.subi %arg1, %c1 : index
%3 = arith.select %0, %1, %2 : index
%4 = arith.divsi %3, %c4 : index
%5 = arith.subi %c0, %4 : index
%6 = arith.addi %4, %c1 : index
%7 = arith.select %0, %5, %6 : index
hal.return %7, %c1, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Kernel body in the LLVM dialect. Only %arg1 (dispatch state) is read;
// %arg0 and %arg2 are unused. Always returns the i32 constant 0.
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(2 : i64) : i64
%2 = llvm.mlir.constant(1 : i64) : i64
// 63 is the mask for the alignment assumptions below (low six address bits).
%3 = llvm.mlir.constant(63 : index) : i64
%4 = llvm.mlir.constant(0 : index) : i64
// Pointer #0: load the dispatch state, take struct field 10 (ptr<ptr<i8>>),
// and load its first element.
// NOTE(review): field 10 is presumably the table of binding base pointers -
// confirm against iree_hal_executable_dispatch_state_v0_t.
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>>
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32>
// Assume (address & 63) == 0, i.e. the pointer is 64-byte aligned.
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64
%10 = llvm.and %9, %3 : i64
%11 = llvm.icmp "eq" %10, %4 : i64
"llvm.intr.assume"(%11) : (i1) -> ()
// Pointer #1: same sequence using element index 1 (%2); the dispatch state is reloaded.
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>>
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32>
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64
%18 = llvm.and %17, %3 : i64
%19 = llvm.icmp "eq" %18, %4 : i64
"llvm.intr.assume"(%19) : (i1) -> ()
// Pointer #2: same sequence using element index 2 (%1).
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32>
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64
%26 = llvm.and %25, %3 : i64
%27 = llvm.icmp "eq" %26, %4 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Compute: load one vector<4xf32> from pointer #0 and one from pointer #1,
// multiply elementwise, and store the product through pointer #2.
// The loads/stores are tagged alignment = 4 even though the assumes above
// state 64-byte alignment.
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
%32 = llvm.fmul %29, %31 : vector<4xf32>
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
llvm.return %0 : i32
}
}
}
}
// Host-side entry point: validates both inputs, allocates the result buffer,
// records and submits one dispatch, and returns a view over the result.
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c2 = arith.constant 2 : index
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
// NOTE(review): 553648160 is an opaque element-type code passed to type(...);
// presumably f32 since the kernel multiplies vector<4xf32> - confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
// Values cached by the initializer.
%_device_query_0 = util.global.load @_device_query_0 : i1
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable
// Each input must be a view of shape [4] whose buffer has minimum_length %c16 (16).
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Result buffer of size %c16.
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16}
// The command buffer is created and begun before the device-support branch.
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Bindings 0 and 1 are the inputs, binding 2 is the result; each uses the range [%c0, %c16].
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c16],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16]
])
// Dispatch entry [0] with a 1x1x1 workgroup count, then barrier, end, submit and wait.
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1])
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
^bb2: // pred: ^bb0
// Taken when the cached device query is false.
util.unreachable "device not supported in the compiled configuration"
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::SerializeTargetExecutablesPass //----- //
// Executable container for the simple_mul dispatch. It holds a single public
// binary variant tagged format "embedded-elf-x86_64" with MIME type
// "application/x-elf".
// NOTE(review): the `data` attribute is printed as `dense<"0xvector<3808xi8>`
// with no closing quote; the raw payload appears to have been elided from this
// dump, so the line will not parse as written.
hal.executable private @simple_mul_dispatch_0 {
hal.executable.binary public @embedded_elf_x86_64 attributes {data = dense<"0xvector<3808xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::SerializeExecutablesPass //----- //
// Same executable as printed after the preceding pass: one public binary
// variant, format "embedded-elf-x86_64", MIME type "application/x-elf".
// NOTE(review): the `data` attribute text (`dense<"0xvector<3808xi8>`) has an
// unterminated string; the payload bytes appear to have been elided from the
// dump rather than printed.
hal.executable private @simple_mul_dispatch_0 {
hal.executable.binary public @embedded_elf_x86_64 attributes {data = dense<"0xvector<3808xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"}
}
// -----// IR Dump After SymbolDCE //----- //
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}>
// Whole-module snapshot: three private globals, the initializer that fills
// them, the serialized dispatch executable, and the @simple_mul entry point.
module attributes {hal.device.targets = [#device_target_cpu]} {
// Globals written by the initializer below and read by @simple_mul.
util.global private @_device_query_0 : i1
util.global private @_executable_layout_0 : !hal.executable_layout
util.global private @_executable_simple_mul_dispatch_0 : !hal.executable
// Initializer: queries the device for the "embedded-elf-x86_64" executable
// format, builds the layouts, and stores the results into the three globals.
util.initializer {
%device = hal.ex.shared_device : !hal.device
// %ok is not used anywhere in this initializer; only %value is consumed.
// NOTE(review): the trailing "= false" is presumably the default value when the
// query cannot be answered - confirm against the HAL dialect documentation.
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
// One push_only descriptor set layout with storage_buffer bindings 0, 1 and 2.
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%descriptor_set_layout]) : !hal.executable_layout
util.global.store %value, @_device_query_0 : i1
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout
// Create the executable only when the format query returned true (^bb1);
// otherwise a null executable is produced (^bb2).
cf.cond_br %value, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Re-loads the layout stored just above instead of reusing %executable_layout.
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
// Merge point: %1 is either the created executable or null.
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable
util.initializer.return
}
// Executable referenced by hal.executable.create above.
// NOTE(review): the `data` payload is printed with an unterminated string and
// appears to have been elided from this dump.
hal.executable private @simple_mul_dispatch_0 {
hal.executable.binary public @embedded_elf_x86_64 attributes {data = dense<"0xvector<3808xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"}
}
// Entry point @simple_mul (marked iree.abi.stub): checks both input buffer
// views, allocates an output buffer, records one dispatch into a one-shot
// command buffer, submits it and waits, then returns a view of the output.
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c2 = arith.constant 2 : index
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
// 553648160 == 0x21000020.
// NOTE(review): presumably a HAL element-type code, and %c1_i32 an
// encoding-type code - confirm against the HAL headers.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
// State produced by the initializer above.
%_device_query_0 = util.global.load @_device_query_0 : i1
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable
// Input 0: assert shape [%c4] with the expected type/encoding, then unwrap the
// backing buffer.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
// The backing buffer must be DeviceVisible, satisfy minimum_length %c16, and
// permit the listed transfer and dispatch-storage usages.
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Input 1: same checks as input 0.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Output buffer sized %c16; unlike the input checks it also requests
// HostVisible|DeviceLocal memory and the Mapping usages.
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
// Recording continues only when %_device_query_0 is true; otherwise control
// goes to ^bb2. Note that begin has already been issued on %cmd at this point.
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Descriptor set %c0: binding 0 -> %buffer (arg0), binding 1 -> %buffer_0
// (arg1), binding 2 -> %buffer_1 (output), each with range [%c0, %c16].
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c16],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16]
])
// Dispatch target [0] of the loaded executable with workgroup count [1, 1, 1].
// NOTE(review): [0] is presumably the entry point ordinal - confirm.
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1])
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
// Wrap the output buffer in a view with the same shape/type/encoding as the inputs.
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
^bb2: // pred: ^bb0
// Reached when %_device_query_0 is false.
util.unreachable "device not supported in the compiled configuration"
}
}
// -----// IR Dump After LoopInvariantCodeMotion //----- //
// Initializer: queries the device for the "embedded-elf-x86_64" executable
// format, builds the descriptor-set and executable layouts, and stores results
// into @_device_query_0, @_executable_layout_0 and
// @_executable_simple_mul_dispatch_0.
util.initializer {
%device = hal.ex.shared_device : !hal.device
// %ok is not used anywhere in this initializer; only %value is consumed.
// NOTE(review): the trailing "= false" is presumably the default value when the
// query cannot be answered - confirm against the HAL dialect documentation.
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
// One push_only descriptor set layout with storage_buffer bindings 0, 1 and 2.
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%descriptor_set_layout]) : !hal.executable_layout
util.global.store %value, @_device_query_0 : i1
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout
// Create the executable only when the format query returned true (^bb1);
// otherwise a null executable is produced (^bb2).
cf.cond_br %value, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Re-loads the layout stored just above instead of reusing %executable_layout.
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
// Merge point: %1 is either the created executable or null.
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable
util.initializer.return
}
// -----// IR Dump After SCFToControlFlow //----- //
// Initializer: performs the device format query, creates the descriptor-set
// and executable layouts, conditionally creates the executable, and writes
// the three globals @_device_query_0, @_executable_layout_0 and
// @_executable_simple_mul_dispatch_0.
util.initializer {
%device = hal.ex.shared_device : !hal.device
// Only %value is consumed below; %ok has no uses in this region.
// NOTE(review): "= false" presumably supplies the value used when the query
// fails - confirm against the HAL dialect documentation.
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
// Three storage_buffer bindings (ordinals 0, 1, 2) in a push_only set layout.
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout
// Executable layout with zero push constants and the single set layout above.
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%descriptor_set_layout]) : !hal.executable_layout
util.global.store %value, @_device_query_0 : i1
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout
// Branch on the query result: ^bb1 creates the executable, ^bb2 yields null.
cf.cond_br %value, ^bb1, ^bb2
^bb1: // pred: ^bb0
// The layout is read back from the global that was stored two ops earlier.
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
// Both paths join here; the block argument carries the executable (or null).
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable
util.initializer.return
}
// -----// IR Dump After LoopCoalescing //----- //
// Entry point @simple_mul (marked iree.abi.stub): checks both input buffer
// views, allocates an output buffer, records one dispatch into a one-shot
// command buffer, submits it and waits, then returns a view of the output.
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c2 = arith.constant 2 : index
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
// 553648160 == 0x21000020.
// NOTE(review): presumably a HAL element-type code, and %c1_i32 an
// encoding-type code - confirm against the HAL headers.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
// Cached state read from module globals.
// NOTE(review): presumably written by the module initializer - confirm.
%_device_query_0 = util.global.load @_device_query_0 : i1
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable
// Input 0: assert shape [%c4] with the expected type/encoding, then unwrap the
// backing buffer.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
// The backing buffer must be DeviceVisible, satisfy minimum_length %c16, and
// permit the listed transfer and dispatch-storage usages.
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Input 1: same checks as input 0.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Output buffer sized %c16; unlike the input checks it also requests
// HostVisible|DeviceLocal memory and the Mapping usages.
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
// Recording continues only when %_device_query_0 is true; otherwise control
// goes to ^bb2. Note that begin has already been issued on %cmd at this point.
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Descriptor set %c0: binding 0 -> %buffer (arg0), binding 1 -> %buffer_0
// (arg1), binding 2 -> %buffer_1 (output), each with range [%c0, %c16].
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c16],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16]
])
// Dispatch target [0] of the loaded executable with workgroup count [1, 1, 1].
// NOTE(review): [0] is presumably the entry point ordinal - confirm.
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1])
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
// Wrap the output buffer in a view with the same shape/type/encoding as the inputs.
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
^bb2: // pred: ^bb0
// Reached when %_device_query_0 is false.
util.unreachable "device not supported in the compiled configuration"
}
// -----// IR Dump After LoopInvariantCodeMotion //----- //
// @simple_mul, an iree.abi.stub function over two !hal.buffer_view inputs.
// It validates the inputs, allocates the result buffer, records and submits a
// single dispatch, and returns a !hal.buffer_view over the result.
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Index constants: %c0/%c1/%c2 serve as binding ordinals (and %c0 as the set
// index and range start), %c16 as the buffer length, %c4 as the view shape,
// and %c1 as each workgroup-count component.
%c2 = arith.constant 2 : index
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
// NOTE(review): 553648160 (0x21000020) and 1 are passed as type(...) and
// encoding(...) below; presumably HAL enum codes - confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
// Globals holding the device-support flag, the executable layout, and the
// executable handle.
%_device_query_0 = util.global.load @_device_query_0 : i1
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable
// Validate %arg0's view metadata, then fetch its buffer.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
// Validate %arg0's buffer against the allocator, minimum length, memory type
// and usage bits.
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Repeat both validations for %arg1.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Allocate the result buffer (size %c16) with host-visible, device-local
// memory and mapping usage in addition to transfer/dispatch usage.
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
// Guard on the device-support flag. The false edge (^bb2) is taken after
// begin has been issued and without a matching end on %cmd.
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Bind arg0, arg1 and the result buffer to bindings 0, 1 and 2 of set %c0.
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c16],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16]
])
// Single dispatch with workgroups [%c1, %c1, %c1], followed by a barrier.
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1])
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
// The returned view reuses the input shape/type/encoding operands.
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
^bb2: // pred: ^bb0
// Failure edge for an unsupported device.
util.unreachable "device not supported in the compiled configuration"
}
// -----// IR Dump After SCFToControlFlow //----- //
// @simple_mul entry point (iree.abi.stub). In order: input validation, output
// allocation, command buffer recording (one dispatch plus a barrier), submit
// and wait, and creation of the returned buffer view.
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c2 = arith.constant 2 : index
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
// Used as the type(...) operand of the view asserts and of the view creation.
// NOTE(review): 553648160 == 0x21000020, presumably a HAL element-type code -
// confirm which element type it denotes.
%c553648160_i32 = arith.constant 553648160 : i32
// Used as the encoding(...) operand.
// NOTE(review): meaning of encoding value 1 is not shown here - confirm.
%c1_i32 = arith.constant 1 : i32
%_device_query_0 = util.global.load @_device_query_0 : i1
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable
// --- Validate first argument ---
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// --- Validate second argument (same constraints) ---
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// --- Allocate the result buffer with size %c16 ---
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16}
// --- Record commands ---
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
// When the flag loaded from @_device_query_0 is false, jump to the
// util.unreachable in ^bb2 instead of recording the dispatch.
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Bindings 0/1 are the two inputs; binding 2 is the result buffer. Each uses
// the range [%c0, %c16].
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c16],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16]
])
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1])
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
// --- Execute and return ---
hal.ex.submit_and_wait %device, %cmd
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
^bb2: // pred: ^bb0
util.unreachable "device not supported in the compiled configuration"
}
// -----// IR Dump After Canonicalizer //----- //
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}>
// Whole-module snapshot: three private globals, the initializer that fills
// them, the serialized dispatch executable, and the @simple_mul entry point.
module attributes {hal.device.targets = [#device_target_cpu]} {
// Globals written by the initializer below and read by @simple_mul.
util.global private @_device_query_0 : i1
util.global private @_executable_layout_0 : !hal.executable_layout
util.global private @_executable_simple_mul_dispatch_0 : !hal.executable
// Initializer: queries the device for the "embedded-elf-x86_64" executable
// format, builds the layouts, and stores the results into the three globals.
util.initializer {
%device = hal.ex.shared_device : !hal.device
// %ok is not used anywhere in this initializer; only %value is consumed.
// NOTE(review): the trailing "= false" is presumably the default value when the
// query cannot be answered - confirm against the HAL dialect documentation.
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
// One push_only descriptor set layout with storage_buffer bindings 0, 1 and 2.
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%descriptor_set_layout]) : !hal.executable_layout
util.global.store %value, @_device_query_0 : i1
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout
// Create the executable only when the format query returned true (^bb1);
// otherwise a null executable is produced (^bb2).
cf.cond_br %value, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Re-loads the layout stored just above instead of reusing %executable_layout.
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
// Merge point: %1 is either the created executable or null.
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable
util.initializer.return
}
// Executable referenced by hal.executable.create above.
// NOTE(review): the `data` payload is printed with an unterminated string and
// appears to have been elided from this dump.
hal.executable private @simple_mul_dispatch_0 {
hal.executable.binary public @embedded_elf_x86_64 attributes {data = dense<"0xvector<3808xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"}
}
// Entry point @simple_mul (marked iree.abi.stub): checks both input buffer
// views, allocates an output buffer, records one dispatch into a one-shot
// command buffer, submits it and waits, then returns a view of the output.
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c2 = arith.constant 2 : index
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
// 553648160 == 0x21000020.
// NOTE(review): presumably a HAL element-type code, and %c1_i32 an
// encoding-type code - confirm against the HAL headers.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
// State produced by the initializer above.
%_device_query_0 = util.global.load @_device_query_0 : i1
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable
// Input 0: assert shape [%c4] with the expected type/encoding, then unwrap the
// backing buffer.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
// The backing buffer must be DeviceVisible, satisfy minimum_length %c16, and
// permit the listed transfer and dispatch-storage usages.
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Input 1: same checks as input 0.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Output buffer sized %c16; unlike the input checks it also requests
// HostVisible|DeviceLocal memory and the Mapping usages.
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
// Recording continues only when %_device_query_0 is true; otherwise control
// goes to ^bb2. Note that begin has already been issued on %cmd at this point.
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Descriptor set %c0: binding 0 -> %buffer (arg0), binding 1 -> %buffer_0
// (arg1), binding 2 -> %buffer_1 (output), each with range [%c0, %c16].
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c16],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16]
])
// Dispatch target [0] of the loaded executable with workgroup count [1, 1, 1].
// NOTE(review): [0] is presumably the entry point ordinal - confirm.
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1])
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
// Wrap the output buffer in a view with the same shape/type/encoding as the inputs.
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
^bb2: // pred: ^bb0
// Reached when %_device_query_0 is false.
util.unreachable "device not supported in the compiled configuration"
}
}
// -----// IR Dump After CSE //----- //
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}>
// Whole-module snapshot: globals, their initializer, the dispatch executable,
// and the @simple_mul entry point that consumes them.
module attributes {hal.device.targets = [#device_target_cpu]} {
// Device-support flag, executable layout, and executable handle; all three are
// stored by the initializer and loaded by @simple_mul.
util.global private @_device_query_0 : i1
util.global private @_executable_layout_0 : !hal.executable_layout
util.global private @_executable_simple_mul_dispatch_0 : !hal.executable
// Initializer: performs the device format query, creates the layouts,
// conditionally creates the executable, and writes the three globals.
util.initializer {
%device = hal.ex.shared_device : !hal.device
// Only %value is consumed below; %ok has no uses in this region.
// NOTE(review): "= false" presumably supplies the value used when the query
// fails - confirm against the HAL dialect documentation.
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
// Three storage_buffer bindings (ordinals 0, 1, 2) in a push_only set layout.
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout
// Executable layout with zero push constants and the single set layout above.
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%descriptor_set_layout]) : !hal.executable_layout
util.global.store %value, @_device_query_0 : i1
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout
// Branch on the query result: ^bb1 creates the executable, ^bb2 yields null.
cf.cond_br %value, ^bb1, ^bb2
^bb1: // pred: ^bb0
// The layout is read back from the global that was stored two ops earlier.
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
// Both paths join here; the block argument carries the executable (or null).
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable
util.initializer.return
}
// Target of the hal.executable.create in the initializer.
// NOTE(review): the `data` attribute text (`dense<"0xvector<3808xi8>`) has an
// unterminated string; the payload bytes appear to have been elided.
hal.executable private @simple_mul_dispatch_0 {
hal.executable.binary public @embedded_elf_x86_64 attributes {data = dense<"0xvector<3808xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"}
}
// @simple_mul, an iree.abi.stub function over two !hal.buffer_view inputs.
// It validates the inputs, allocates the result buffer, records and submits a
// single dispatch, and returns a !hal.buffer_view over the result.
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c2 = arith.constant 2 : index
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
// NOTE(review): 553648160 (0x21000020) and 1 are passed as type(...) and
// encoding(...) below; presumably HAL enum codes - confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
// Values stored by the initializer above.
%_device_query_0 = util.global.load @_device_query_0 : i1
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable
// Validate %arg0's view metadata, then fetch its buffer.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
// Validate %arg0's buffer against the allocator, minimum length, memory type
// and usage bits.
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Repeat both validations for %arg1.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
// Allocate the result buffer (size %c16) with host-visible, device-local
// memory and mapping usage in addition to transfer/dispatch usage.
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
// Guard on the device-support flag. The false edge (^bb2) is taken after
// begin has been issued and without a matching end on %cmd.
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Bind arg0, arg1 and the result buffer to bindings 0, 1 and 2 of set %c0.
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c16],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16]
])
// Single dispatch with workgroups [%c1, %c1, %c1], followed by a barrier.
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1])
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
// The returned view reuses the input shape/type/encoding operands.
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
^bb2: // pred: ^bb0
// Failure edge for an unsupported device.
util.unreachable "device not supported in the compiled configuration"
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::VM::ConversionPass //----- //
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}>
module attributes {hal.device.targets = [#device_target_cpu], vm.toplevel} {
vm.module public @module {
// Module-level state written by the vm.initializer that follows: the i32 result of the
// device format query, the executable layout ref, and the executable ref.
vm.global.i32 private @_device_query_0 : i32
vm.global.ref private @_executable_layout_0 : !vm.ref<!hal.executable_layout>
vm.global.ref private @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable>
// Embedded ELF payload (mime_type "application/x-elf", 16-byte alignment) that the initializer passes to @hal.executable.create.
// NOTE(review): the dense<...> payload is printed in elided form in this dump (unbalanced quote / angle bracket), so this line is not re-parseable as-is.
vm.rodata private @simple_mul_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0xvector<3808xi8>
// Module initializer. Queries the shared device with category "hal.executable.format" and
// key "embedded-elf-x86_64", creates the descriptor set layout and executable layout, and
// stores the query result, the layout, and the (possibly null) executable into the module globals.
vm.initializer {
%ref = vm.call @hal.ex.shared_device() {nosideeffects} : () -> !vm.ref<!hal.device>
// Inline UTF-8 bytes spelling "hal.executable.format"; passed as the %category operand of the query.
%buffer = vm.rodata.inline "_utf8_hal_executable_format_EAB228F999C2D3A1" {alignment = 1 : i64} : !vm.buffer = dense<[104, 97, 108, 46, 101, 120, 101, 99, 117, 116, 97, 98, 108, 101, 46, 102, 111, 114, 109, 97, 116]> : vector<21xi8>
// Inline UTF-8 bytes spelling "embedded-elf-x86_64"; passed as the %key operand of the query.
%buffer_0 = vm.rodata.inline "_utf8_embedded_elf_x86_64_9FD8733DA4A6F228" {alignment = 1 : i64} : !vm.buffer = dense<[101, 109, 98, 101, 100, 100, 101, 100, 45, 101, 108, 102, 45, 120, 56, 54, 95, 54, 52]> : vector<19xi8>
%0:2 = vm.call @hal.device.query.i32(%ref, %buffer, %buffer_0) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i32)
// %2 = (%0#0 != 0) ? (%0#1 & 1) : 0.
// NOTE(review): presumably %0#0 is an "ok" flag and %0#1 the queried value — confirm against the HAL module definition.
%c1 = vm.const.i32 1
%1 = vm.and.i32 %0#1, %c1 : i32
%zero = vm.const.i32.zero
%2 = vm.select.i32 %0#0, %1, %zero : i32
// %c1_1 is not referenced by any later op in this initializer.
%c1_1 = vm.const.i32 1
// Operands for the descriptor set layout: %usage_type = 1 and three (i32, i32) binding tuples (0, 7), (1, 7), (2, 7).
// The meaning of the tuple fields is not spelled out by the import signature in this dump.
%c1_2 = vm.const.i32 1
%zero_3 = vm.const.i32.zero
%c7 = vm.const.i32 7
%c1_4 = vm.const.i32 1
%c7_5 = vm.const.i32 7
%c2 = vm.const.i32 2
%c7_6 = vm.const.i32 7
%ref_7 = vm.call.variadic @hal.descriptor_set_layout.create(%ref, %c1_2, [(%zero_3, %c7), (%c1_4, %c7_5), (%c2, %c7_6)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
// Executable layout with %push_constants = 0 and the single set layout created above.
%zero_8 = vm.const.i32.zero
%ref_9 = vm.call.variadic @hal.executable_layout.create(%ref, %zero_8, [%ref_7]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout>
// Publish the query result and layout to globals; @simple_mul loads both.
vm.global.store.i32 %2, @_device_query_0 : i32
vm.global.store.ref %ref_9, @_executable_layout_0 : !vm.ref<!hal.executable_layout>
// Create the executable only when the query result is non-zero; otherwise ^bb2 supplies a null ref.
vm.cond_br %2, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Reloads the layout that was stored just above.
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
// Second inline copy of "embedded-elf-x86_64", passed here as %executable_format.
%buffer_10 = vm.rodata.inline "_utf8_embedded_elf_x86_64_9FD8733DA4A6F228" {alignment = 1 : i64} : !vm.buffer = dense<[101, 109, 98, 101, 100, 100, 101, 100, 45, 101, 108, 102, 45, 120, 56, 54, 95, 54, 52]> : vector<19xi8>
%simple_mul_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @simple_mul_dispatch_0_embedded_elf_x86_64 : !vm.buffer
// Null buffer passed as the %constants operand.
%null = vm.const.ref.zero : !vm.buffer
%ref_11 = vm.call.variadic @hal.executable.create(%ref, %buffer_10, %simple_mul_dispatch_0_embedded_elf_x86_64, %null, [%_executable_layout_0]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable>
vm.br ^bb3(%ref_11 : !vm.ref<!hal.executable>)
^bb2: // pred: ^bb0
%null_12 = vm.const.ref.zero : !vm.ref<!hal.executable>
vm.br ^bb3(%null_12 : !vm.ref<!hal.executable>)
// Join point: %3 is either the created executable or the null ref.
^bb3(%3: !vm.ref<!hal.executable>): // 2 preds: ^bb1, ^bb2
vm.global.store.ref %3, @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable>
vm.return
}
vm.import @hal.ex.shared_device() -> !vm.ref<!hal.device> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.ex.submit_and_wait(%device : !vm.ref<!hal.device>, %command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
vm.import @hal.allocator.map.byte_buffer(%allocator : !vm.ref<!hal.allocator>, %try : i32, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
vm.import @hal.allocator.wrap.byte_buffer(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
vm.import @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) attributes {sym_visibility = "private"}
vm.import @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer.length(%buffer : !vm.ref<!hal.buffer>) -> i64 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i32) -> i32 attributes {sym_visibility = "private"}
vm.import @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i32) attributes {sym_visibility = "private"}
vm.import @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) attributes {sym_visibility = "private"}
vm.import @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.byte_length(%buffer_view : !vm.ref<!hal.buffer_view>) -> i64 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.encoding_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref<!hal.buffer_view> ...) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32) -> !vm.ref<!hal.command_buffer> attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.begin(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.end(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>, %label : !vm.buffer) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %pattern : i32, %pattern_length : i32) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %offset : i32, %values : i32 ...) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.bind_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %descriptor_set : !vm.ref<!hal.descriptor_set>, %dynamic_offsets : i64 ...) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64) attributes {sym_visibility = "private"}
vm.import @hal.descriptor_set.create(%device : !vm.ref<!hal.device>, %set_layout : !vm.ref<!hal.descriptor_set_layout>, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) -> !vm.ref<!hal.descriptor_set> attributes {sym_visibility = "private"}
vm.import @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %usage_type : i32, %bindings : tuple<i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.device.query.i32(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i32) attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %executable_layouts : !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.executable_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.semaphore.create(%device : !vm.ref<!hal.device>, %initial_value : i64) -> !vm.ref<!hal.semaphore> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.semaphore.query(%semaphore : !vm.ref<!hal.semaphore>) -> (i32, i64) attributes {sym_visibility = "private"}
vm.import @hal.semaphore.signal(%semaphore : !vm.ref<!hal.semaphore>, %new_value : i64) attributes {sym_visibility = "private"}
vm.import @hal.semaphore.fail(%semaphore : !vm.ref<!hal.semaphore>, %status : i32) attributes {sym_visibility = "private"}
vm.import @hal.semaphore.await(%semaphore : !vm.ref<!hal.semaphore>, %min_value : i64) -> i32 attributes {sym_visibility = "private"}
// @simple_mul (exported below via vm.export): asserts both input buffer views and their backing
// buffers, allocates a 16-byte result buffer, records a command buffer containing one dispatch of
// @_executable_simple_mul_dispatch_0, submits it and waits, then returns a buffer view of shape [4]
// over the result buffer. Fails with status 2 when the cached device query result is zero.
vm.func private @simple_mul(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> {
// Shared i32 constants. %c16 and %c4 are not referenced by any op in this function.
%c2 = vm.const.i32 2
%zero = vm.const.i32.zero
%c16 = vm.const.i32 16
%c4 = vm.const.i32 4
%c1 = vm.const.i32 1
// Element type code (0x21000020) used for both asserts and the result view.
// NOTE(review): presumably the f32 element type — confirm against the HAL element type encoding.
%c553648160 = vm.const.i32 553648160
// Encoding type code used for both asserts and the result view.
%c1_0 = vm.const.i32 1
// State cached by the module initializer.
%_device_query_0 = vm.global.load.i32 @_device_query_0 : i32
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
%_executable_simple_mul_dispatch_0 = vm.global.load.ref @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable>
// --- Input 0: assert the view (message "tensor", shape [4]) and its backing buffer. ---
// The same "tensor" byte string is inlined four times in this function, once per assert.
%buffer = vm.rodata.inline "_utf8_tensor_3C6209B4FD120BDC" {alignment = 1 : i64} : !vm.buffer = dense<[116, 101, 110, 115, 111, 114]> : vector<6xi8>
%c4_1 = vm.const.i64 4
vm.call.variadic @hal.buffer_view.assert(%arg0, %buffer, %c553648160, %c1_0, [%c4_1]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
%ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
%ref_2 = vm.call @hal.ex.shared_device() {nosideeffects} : () -> !vm.ref<!hal.device>
%ref_3 = vm.call @hal.device.allocator(%ref_2) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
%buffer_4 = vm.rodata.inline "_utf8_tensor_3C6209B4FD120BDC" {alignment = 1 : i64} : !vm.buffer = dense<[116, 101, 110, 115, 111, 114]> : vector<6xi8>
// Buffer assert operands: %minimum_length = 16 (i64), %memory_types = 16, %buffer_usage = 3075.
// NOTE(review): the two i32 values look like the lowered forms of the type(...)/usage(...) flags on the
// hal.buffer.assert op in the preceding HAL-level dump — confirm the bit assignments.
%c16_5 = vm.const.i64 16
%c16_6 = vm.const.i32 16
%c3075 = vm.const.i32 3075
vm.call @hal.buffer.assert(%ref, %buffer_4, %ref_3, %c16_5, %c16_6, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
// --- Input 1: same pair of asserts for %arg1. ---
%buffer_7 = vm.rodata.inline "_utf8_tensor_3C6209B4FD120BDC" {alignment = 1 : i64} : !vm.buffer = dense<[116, 101, 110, 115, 111, 114]> : vector<6xi8>
%c4_8 = vm.const.i64 4
vm.call.variadic @hal.buffer_view.assert(%arg1, %buffer_7, %c553648160, %c1_0, [%c4_8]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
%ref_9 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
%buffer_10 = vm.rodata.inline "_utf8_tensor_3C6209B4FD120BDC" {alignment = 1 : i64} : !vm.buffer = dense<[116, 101, 110, 115, 111, 114]> : vector<6xi8>
%c16_11 = vm.const.i64 16
%c16_12 = vm.const.i32 16
%c3075_13 = vm.const.i32 3075
vm.call @hal.buffer.assert(%ref_9, %buffer_10, %ref_3, %c16_11, %c16_12, %c3075_13) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
// --- Result buffer: %memory_types = 50, %buffer_usage = 150998019, %allocation_size = 16. ---
%c50 = vm.const.i32 50
%c150998019 = vm.const.i32 150998019
%c16_14 = vm.const.i64 16
%ref_15 = vm.call @hal.allocator.allocate(%ref_3, %c50, %c150998019, %c16_14) : (!vm.ref<!hal.allocator>, i32, i32, i64) -> !vm.ref<!hal.buffer>
// --- Command buffer: %modes = 17, %command_categories = 3. ---
%c17 = vm.const.i32 17
%c3 = vm.const.i32 3
%ref_16 = vm.call @hal.command_buffer.create(%ref_2, %c17, %c3) : (!vm.ref<!hal.device>, i32, i32) -> !vm.ref<!hal.command_buffer>
vm.call @hal.command_buffer.begin(%ref_16) : (!vm.ref<!hal.command_buffer>) -> ()
// Note: begin is issued before this branch, so the ^bb2 failure path leaves the command buffer begun but never ended.
vm.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Each binding tuple below carries an i64 pair (0, 16).
// NOTE(review): presumably (offset, length) in bytes — the tuple fields are unnamed in the import signature.
%zero_17 = vm.const.i64.zero
%c16_18 = vm.const.i64 16
%zero_19 = vm.const.i64.zero
%c16_20 = vm.const.i64 16
%zero_21 = vm.const.i64.zero
%c16_22 = vm.const.i64 16
// Set 0 with three bindings: 0 -> input 0 buffer, 1 -> input 1 buffer, 2 -> result buffer.
vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_16, %_executable_layout_0, %zero, [(%zero, %ref, %zero_17, %c16_18), (%c1, %ref_9, %zero_19, %c16_20), (%c2, %ref_15, %zero_21, %c16_22)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable_layout>, i32, tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...)
// Dispatch entry point 0 with workgroup counts (1, 1, 1).
%zero_23 = vm.const.i32.zero
vm.call @hal.command_buffer.dispatch(%ref_16, %_executable_simple_mul_dispatch_0, %zero_23, %c1, %c1, %c1) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32) -> ()
// Barrier operands: %source_stage_mask = 28, %target_stage_mask = 13, %flags = 0.
%c28 = vm.const.i32 28
%c13 = vm.const.i32 13
%zero_24 = vm.const.i32.zero
vm.call @hal.command_buffer.execution_barrier(%ref_16, %c28, %c13, %zero_24) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
vm.call @hal.command_buffer.end(%ref_16) : (!vm.ref<!hal.command_buffer>) -> ()
vm.call @hal.ex.submit_and_wait(%ref_2, %ref_16) : (!vm.ref<!hal.device>, !vm.ref<!hal.command_buffer>) -> ()
// Wrap the result buffer in a view with the same element/encoding type codes and shape [4].
%c4_25 = vm.const.i64 4
%ref_26 = vm.call.variadic @hal.buffer_view.create(%ref_15, %c553648160, %c1_0, [%c4_25]) {nosideeffects} : (!vm.ref<!hal.buffer>, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
vm.return %ref_26 : !vm.ref<!hal.buffer_view>
^bb2: // pred: ^bb0
// Failure path taken when the cached device query result is zero.
%c2_27 = vm.const.i32 2
vm.fail %c2_27, "device not supported in the compiled configuration"
}
vm.export @simple_mul
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::VM::HoistInlinedRodataPass //----- //
vm.module public @module {
// Module-level state written by the vm.initializer that follows: the i32 result of the
// device format query, the executable layout ref, and the executable ref.
vm.global.i32 private @_device_query_0 : i32
vm.global.ref private @_executable_layout_0 : !vm.ref<!hal.executable_layout>
vm.global.ref private @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable>
// Embedded ELF payload passed to @hal.executable.create by the initializer.
// NOTE(review): the dense<...> payload is printed in elided form in this dump (unbalanced quote / angle bracket), so this line is not re-parseable as-is.
vm.rodata private @simple_mul_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0xvector<3808xi8>
// Module-level rodata referenced by the initializer through vm.const.ref.rodata.
// UTF-8 bytes spelling "hal.executable.format".
vm.rodata private @_utf8_hal_executable_format_EAB228F999C2D3A1 {alignment = 1 : i64} dense<[104, 97, 108, 46, 101, 120, 101, 99, 117, 116, 97, 98, 108, 101, 46, 102, 111, 114, 109, 97, 116]> : vector<21xi8>
// UTF-8 bytes spelling "embedded-elf-x86_64"; the "_0" symbol below holds byte-identical contents.
vm.rodata private @_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 {alignment = 1 : i64} dense<[101, 109, 98, 101, 100, 100, 101, 100, 45, 101, 108, 102, 45, 120, 56, 54, 95, 54, 52]> : vector<19xi8>
vm.rodata private @_utf8_embedded_elf_x86_64_9FD8733DA4A6F228_0 {alignment = 1 : i64} dense<[101, 109, 98, 101, 100, 100, 101, 100, 45, 101, 108, 102, 45, 120, 56, 54, 95, 54, 52]> : vector<19xi8>
// Module initializer. Queries the shared device with category "hal.executable.format" and
// key "embedded-elf-x86_64", creates the descriptor set layout and executable layout, and
// stores the query result, the layout, and the (possibly null) executable into the module globals.
// String operands are references to module-level vm.rodata symbols.
vm.initializer {
%ref = vm.call @hal.ex.shared_device() {nosideeffects} : () -> !vm.ref<!hal.device>
// %category and %key operands of the device query.
%_utf8_hal_executable_format_EAB228F999C2D3A1 = vm.const.ref.rodata @_utf8_hal_executable_format_EAB228F999C2D3A1 : !vm.buffer
%_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 : !vm.buffer
%0:2 = vm.call @hal.device.query.i32(%ref, %_utf8_hal_executable_format_EAB228F999C2D3A1, %_utf8_embedded_elf_x86_64_9FD8733DA4A6F228) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i32)
// %2 = (%0#0 != 0) ? (%0#1 & 1) : 0.
// NOTE(review): presumably %0#0 is an "ok" flag and %0#1 the queried value — confirm against the HAL module definition.
%c1 = vm.const.i32 1
%1 = vm.and.i32 %0#1, %c1 : i32
%zero = vm.const.i32.zero
%2 = vm.select.i32 %0#0, %1, %zero : i32
// %c1_0 is not referenced by any later op in this initializer.
%c1_0 = vm.const.i32 1
// Operands for the descriptor set layout: %usage_type = 1 and three (i32, i32) binding tuples (0, 7), (1, 7), (2, 7).
// The meaning of the tuple fields is not spelled out by the import signature in this dump.
%c1_1 = vm.const.i32 1
%zero_2 = vm.const.i32.zero
%c7 = vm.const.i32 7
%c1_3 = vm.const.i32 1
%c7_4 = vm.const.i32 7
%c2 = vm.const.i32 2
%c7_5 = vm.const.i32 7
%ref_6 = vm.call.variadic @hal.descriptor_set_layout.create(%ref, %c1_1, [(%zero_2, %c7), (%c1_3, %c7_4), (%c2, %c7_5)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
// Executable layout with %push_constants = 0 and the single set layout created above.
%zero_7 = vm.const.i32.zero
%ref_8 = vm.call.variadic @hal.executable_layout.create(%ref, %zero_7, [%ref_6]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout>
// Publish the query result and layout to globals; @simple_mul loads both.
vm.global.store.i32 %2, @_device_query_0 : i32
vm.global.store.ref %ref_8, @_executable_layout_0 : !vm.ref<!hal.executable_layout>
// Create the executable only when the query result is non-zero; otherwise ^bb2 supplies a null ref.
vm.cond_br %2, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Reloads the layout that was stored just above.
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
// Uses the "_0" copy of the format string as %executable_format.
%_utf8_embedded_elf_x86_64_9FD8733DA4A6F228_0 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_9FD8733DA4A6F228_0 : !vm.buffer
%simple_mul_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @simple_mul_dispatch_0_embedded_elf_x86_64 : !vm.buffer
// Null buffer passed as the %constants operand.
%null = vm.const.ref.zero : !vm.buffer
%ref_9 = vm.call.variadic @hal.executable.create(%ref, %_utf8_embedded_elf_x86_64_9FD8733DA4A6F228_0, %simple_mul_dispatch_0_embedded_elf_x86_64, %null, [%_executable_layout_0]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable>
vm.br ^bb3(%ref_9 : !vm.ref<!hal.executable>)
^bb2: // pred: ^bb0
%null_10 = vm.const.ref.zero : !vm.ref<!hal.executable>
vm.br ^bb3(%null_10 : !vm.ref<!hal.executable>)
// Join point: %3 is either the created executable or the null ref.
^bb3(%3: !vm.ref<!hal.executable>): // 2 preds: ^bb1, ^bb2
vm.global.store.ref %3, @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable>
vm.return
}
vm.import @hal.ex.shared_device() -> !vm.ref<!hal.device> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.ex.submit_and_wait(%device : !vm.ref<!hal.device>, %command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
vm.import @hal.allocator.map.byte_buffer(%allocator : !vm.ref<!hal.allocator>, %try : i32, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
vm.import @hal.allocator.wrap.byte_buffer(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
vm.import @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) attributes {sym_visibility = "private"}
vm.import @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer.length(%buffer : !vm.ref<!hal.buffer>) -> i64 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i32) -> i32 attributes {sym_visibility = "private"}
vm.import @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i32) attributes {sym_visibility = "private"}
vm.import @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) attributes {sym_visibility = "private"}
vm.import @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.byte_length(%buffer_view : !vm.ref<!hal.buffer_view>) -> i64 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.encoding_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref<!hal.buffer_view> ...) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32) -> !vm.ref<!hal.command_buffer> attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.begin(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.end(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>, %label : !vm.buffer) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %pattern : i32, %pattern_length : i32) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %offset : i32, %values : i32 ...) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.bind_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %descriptor_set : !vm.ref<!hal.descriptor_set>, %dynamic_offsets : i64 ...) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64) attributes {sym_visibility = "private"}
vm.import @hal.descriptor_set.create(%device : !vm.ref<!hal.device>, %set_layout : !vm.ref<!hal.descriptor_set_layout>, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) -> !vm.ref<!hal.descriptor_set> attributes {sym_visibility = "private"}
vm.import @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %usage_type : i32, %bindings : tuple<i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.device.query.i32(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i32) attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %executable_layouts : !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.executable_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.semaphore.create(%device : !vm.ref<!hal.device>, %initial_value : i64) -> !vm.ref<!hal.semaphore> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.semaphore.query(%semaphore : !vm.ref<!hal.semaphore>) -> (i32, i64) attributes {sym_visibility = "private"}
vm.import @hal.semaphore.signal(%semaphore : !vm.ref<!hal.semaphore>, %new_value : i64) attributes {sym_visibility = "private"}
vm.import @hal.semaphore.fail(%semaphore : !vm.ref<!hal.semaphore>, %status : i32) attributes {sym_visibility = "private"}
vm.import @hal.semaphore.await(%semaphore : !vm.ref<!hal.semaphore>, %min_value : i64) -> i32 attributes {sym_visibility = "private"}
// Four module-level rodata symbols, each holding the UTF-8 bytes of "tensor".
// @simple_mul references one per assert call as the %message operand; the contents are byte-identical.
vm.rodata private @_utf8_tensor_3C6209B4FD120BDC {alignment = 1 : i64} dense<[116, 101, 110, 115, 111, 114]> : vector<6xi8>
vm.rodata private @_utf8_tensor_3C6209B4FD120BDC_1 {alignment = 1 : i64} dense<[116, 101, 110, 115, 111, 114]> : vector<6xi8>
vm.rodata private @_utf8_tensor_3C6209B4FD120BDC_2 {alignment = 1 : i64} dense<[116, 101, 110, 115, 111, 114]> : vector<6xi8>
vm.rodata private @_utf8_tensor_3C6209B4FD120BDC_3 {alignment = 1 : i64} dense<[116, 101, 110, 115, 111, 114]> : vector<6xi8>
// @simple_mul (exported below via vm.export): asserts both input buffer views and their backing
// buffers, allocates a 16-byte result buffer, records a command buffer containing one dispatch of
// @_executable_simple_mul_dispatch_0, submits it and waits, then returns a buffer view of shape [4]
// over the result buffer. Fails with status 2 when the cached device query result is zero.
// Assert messages are references to module-level "tensor" rodata symbols.
vm.func private @simple_mul(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> {
// Shared i32 constants. %c16 and %c4 are not referenced by any op in this function.
%c2 = vm.const.i32 2
%zero = vm.const.i32.zero
%c16 = vm.const.i32 16
%c4 = vm.const.i32 4
%c1 = vm.const.i32 1
// Element type code (0x21000020) used for both asserts and the result view.
// NOTE(review): presumably the f32 element type — confirm against the HAL element type encoding.
%c553648160 = vm.const.i32 553648160
// Encoding type code used for both asserts and the result view.
%c1_0 = vm.const.i32 1
// State cached by the module initializer.
%_device_query_0 = vm.global.load.i32 @_device_query_0 : i32
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
%_executable_simple_mul_dispatch_0 = vm.global.load.ref @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable>
// --- Input 0: assert the view (message "tensor", shape [4]) and its backing buffer. ---
%_utf8_tensor_3C6209B4FD120BDC = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer
%c4_1 = vm.const.i64 4
vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_tensor_3C6209B4FD120BDC, %c553648160, %c1_0, [%c4_1]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
%ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
%ref_2 = vm.call @hal.ex.shared_device() {nosideeffects} : () -> !vm.ref<!hal.device>
%ref_3 = vm.call @hal.device.allocator(%ref_2) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
%_utf8_tensor_3C6209B4FD120BDC_1 = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC_1 : !vm.buffer
// Buffer assert operands: %minimum_length = 16 (i64), %memory_types = 16, %buffer_usage = 3075.
// NOTE(review): the i32 values are flag bitmasks whose symbolic names are not shown at this level — confirm against the HAL enums.
%c16_4 = vm.const.i64 16
%c16_5 = vm.const.i32 16
%c3075 = vm.const.i32 3075
vm.call @hal.buffer.assert(%ref, %_utf8_tensor_3C6209B4FD120BDC_1, %ref_3, %c16_4, %c16_5, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
// --- Input 1: same pair of asserts for %arg1. ---
%_utf8_tensor_3C6209B4FD120BDC_2 = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC_2 : !vm.buffer
%c4_6 = vm.const.i64 4
vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_tensor_3C6209B4FD120BDC_2, %c553648160, %c1_0, [%c4_6]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
%ref_7 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
%_utf8_tensor_3C6209B4FD120BDC_3 = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC_3 : !vm.buffer
%c16_8 = vm.const.i64 16
%c16_9 = vm.const.i32 16
%c3075_10 = vm.const.i32 3075
vm.call @hal.buffer.assert(%ref_7, %_utf8_tensor_3C6209B4FD120BDC_3, %ref_3, %c16_8, %c16_9, %c3075_10) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
// --- Result buffer: %memory_types = 50, %buffer_usage = 150998019, %allocation_size = 16. ---
%c50 = vm.const.i32 50
%c150998019 = vm.const.i32 150998019
%c16_11 = vm.const.i64 16
%ref_12 = vm.call @hal.allocator.allocate(%ref_3, %c50, %c150998019, %c16_11) : (!vm.ref<!hal.allocator>, i32, i32, i64) -> !vm.ref<!hal.buffer>
// --- Command buffer: %modes = 17, %command_categories = 3. ---
%c17 = vm.const.i32 17
%c3 = vm.const.i32 3
%ref_13 = vm.call @hal.command_buffer.create(%ref_2, %c17, %c3) : (!vm.ref<!hal.device>, i32, i32) -> !vm.ref<!hal.command_buffer>
vm.call @hal.command_buffer.begin(%ref_13) : (!vm.ref<!hal.command_buffer>) -> ()
// Note: begin is issued before this branch, so the ^bb2 failure path leaves the command buffer begun but never ended.
vm.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Each binding tuple below carries an i64 pair (0, 16).
// NOTE(review): presumably (offset, length) in bytes — the tuple fields are unnamed in the import signature.
%zero_14 = vm.const.i64.zero
%c16_15 = vm.const.i64 16
%zero_16 = vm.const.i64.zero
%c16_17 = vm.const.i64 16
%zero_18 = vm.const.i64.zero
%c16_19 = vm.const.i64 16
// Set 0 with three bindings: 0 -> input 0 buffer, 1 -> input 1 buffer, 2 -> result buffer.
vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_13, %_executable_layout_0, %zero, [(%zero, %ref, %zero_14, %c16_15), (%c1, %ref_7, %zero_16, %c16_17), (%c2, %ref_12, %zero_18, %c16_19)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable_layout>, i32, tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...)
// Dispatch entry point 0 with workgroup counts (1, 1, 1).
%zero_20 = vm.const.i32.zero
vm.call @hal.command_buffer.dispatch(%ref_13, %_executable_simple_mul_dispatch_0, %zero_20, %c1, %c1, %c1) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32) -> ()
// Barrier operands: %source_stage_mask = 28, %target_stage_mask = 13, %flags = 0.
%c28 = vm.const.i32 28
%c13 = vm.const.i32 13
%zero_21 = vm.const.i32.zero
vm.call @hal.command_buffer.execution_barrier(%ref_13, %c28, %c13, %zero_21) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
vm.call @hal.command_buffer.end(%ref_13) : (!vm.ref<!hal.command_buffer>) -> ()
vm.call @hal.ex.submit_and_wait(%ref_2, %ref_13) : (!vm.ref<!hal.device>, !vm.ref<!hal.command_buffer>) -> ()
// Wrap the result buffer in a view with the same element/encoding type codes and shape [4].
%c4_22 = vm.const.i64 4
%ref_23 = vm.call.variadic @hal.buffer_view.create(%ref_12, %c553648160, %c1_0, [%c4_22]) {nosideeffects} : (!vm.ref<!hal.buffer>, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
vm.return %ref_23 : !vm.ref<!hal.buffer_view>
^bb2: // pred: ^bb0
// Failure path taken when the cached device query result is zero.
%c2_24 = vm.const.i32 2
vm.fail %c2_24, "device not supported in the compiled configuration"
}
vm.export @simple_mul
}
// -----// IR Dump After mlir::iree_compiler::IREE::VM::DeduplicateRodataPass //----- //
vm.module public @module {
// Module-level state: three private globals followed by three read-only data blobs.
// All three globals are written by the vm.initializer in this module and read by @simple_mul.
vm.global.i32 private @_device_query_0 : i32
vm.global.ref private @_executable_layout_0 : !vm.ref<!hal.executable_layout>
vm.global.ref private @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable>
// Embedded executable payload (mime_type "application/x-elf", 16-byte alignment attribute).
// NOTE(review): the dense<"0x...> literal on the next line looks elided/truncated in this dump
// (no closing quote or "> :" before the vector type) - confirm against the original compiler output.
vm.rodata private @simple_mul_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0xvector<3808xi8>
// The 21 bytes below are the ASCII string "hal.executable.format".
vm.rodata private @_utf8_hal_executable_format_EAB228F999C2D3A1 {alignment = 1 : i64} dense<[104, 97, 108, 46, 101, 120, 101, 99, 117, 116, 97, 98, 108, 101, 46, 102, 111, 114, 109, 97, 116]> : vector<21xi8>
// The 19 bytes below are the ASCII string "embedded-elf-x86_64".
vm.rodata private @_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 {alignment = 1 : i64} dense<[101, 109, 98, 101, 100, 100, 101, 100, 45, 101, 108, 102, 45, 120, 56, 54, 95, 54, 52]> : vector<19xi8>
// Module initializer: queries the shared device, builds the descriptor-set/executable layouts,
// and (when the query succeeds) creates the executable from the embedded ELF rodata.
// Results are stored into @_device_query_0, @_executable_layout_0 and
// @_executable_simple_mul_dispatch_0.
vm.initializer {
%ref = vm.call @hal.ex.shared_device() {nosideeffects} : () -> !vm.ref<!hal.device>
// Query the device with category "hal.executable.format" and key "embedded-elf-x86_64"
// (the two rodata byte strings; %category/%key per the @hal.device.query.i32 import).
%_utf8_hal_executable_format_EAB228F999C2D3A1 = vm.const.ref.rodata @_utf8_hal_executable_format_EAB228F999C2D3A1 : !vm.buffer
%_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 : !vm.buffer
%0:2 = vm.call @hal.device.query.i32(%ref, %_utf8_hal_executable_format_EAB228F999C2D3A1, %_utf8_embedded_elf_x86_64_9FD8733DA4A6F228) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i32)
// %2 = (%0#0 ? (%0#1 & 1) : 0): the low bit of the second result, gated by the first result.
// NOTE(review): presumably %0#0 is an "ok" flag and %0#1 the queried value - confirm against the HAL import docs.
%c1 = vm.const.i32 1
%1 = vm.and.i32 %0#1, %c1 : i32
%zero = vm.const.i32.zero
%2 = vm.select.i32 %0#0, %1, %zero : i32
// Constants for the layout calls below. %c1_0 is not referenced anywhere else in this initializer.
%c1_0 = vm.const.i32 1
%c1_1 = vm.const.i32 1
%zero_2 = vm.const.i32.zero
%c7 = vm.const.i32 7
%c1_3 = vm.const.i32 1
%c7_4 = vm.const.i32 7
%c2 = vm.const.i32 2
%c7_5 = vm.const.i32 7
// Descriptor set layout: usage_type = 1 and three (i32, i32) binding tuples (0, 7), (1, 7), (2, 7).
// NOTE(review): the meaning of each tuple element is not visible in this dump - confirm before relying on it.
%ref_6 = vm.call.variadic @hal.descriptor_set_layout.create(%ref, %c1_1, [(%zero_2, %c7), (%c1_3, %c7_4), (%c2, %c7_5)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
// Executable layout: push_constants = 0 and a single set layout (%ref_6).
%zero_7 = vm.const.i32.zero
%ref_8 = vm.call.variadic @hal.executable_layout.create(%ref, %zero_7, [%ref_6]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout>
// Publish the query result and the layout before branching on the query result.
vm.global.store.i32 %2, @_device_query_0 : i32
vm.global.store.ref %ref_8, @_executable_layout_0 : !vm.ref<!hal.executable_layout>
vm.cond_br %2, ^bb1, ^bb2
// ^bb1 (query result true): create the executable from the embedded ELF blob, passing the
// "embedded-elf-x86_64" format string, a null constants buffer, and the layout reloaded from the global.
^bb1: // pred: ^bb0
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
%_utf8_embedded_elf_x86_64_9FD8733DA4A6F228_9 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 : !vm.buffer
%simple_mul_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @simple_mul_dispatch_0_embedded_elf_x86_64 : !vm.buffer
%null = vm.const.ref.zero : !vm.buffer
%ref_10 = vm.call.variadic @hal.executable.create(%ref, %_utf8_embedded_elf_x86_64_9FD8733DA4A6F228_9, %simple_mul_dispatch_0_embedded_elf_x86_64, %null, [%_executable_layout_0]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable>
vm.br ^bb3(%ref_10 : !vm.ref<!hal.executable>)
// ^bb2 (query result false): no executable is created; a null reference is forwarded instead.
^bb2: // pred: ^bb0
%null_11 = vm.const.ref.zero : !vm.ref<!hal.executable>
vm.br ^bb3(%null_11 : !vm.ref<!hal.executable>)
// ^bb3: join point; stores whichever executable reference (real or null) arrived as the block argument.
^bb3(%3: !vm.ref<!hal.executable>): // 2 preds: ^bb1, ^bb2
vm.global.store.ref %3, @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable>
vm.return
}
vm.import @hal.ex.shared_device() -> !vm.ref<!hal.device> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.ex.submit_and_wait(%device : !vm.ref<!hal.device>, %command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
vm.import @hal.allocator.map.byte_buffer(%allocator : !vm.ref<!hal.allocator>, %try : i32, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
vm.import @hal.allocator.wrap.byte_buffer(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
vm.import @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) attributes {sym_visibility = "private"}
vm.import @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer.length(%buffer : !vm.ref<!hal.buffer>) -> i64 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i32) -> i32 attributes {sym_visibility = "private"}
vm.import @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i32) attributes {sym_visibility = "private"}
vm.import @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) attributes {sym_visibility = "private"}
vm.import @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.byte_length(%buffer_view : !vm.ref<!hal.buffer_view>) -> i64 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.encoding_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref<!hal.buffer_view> ...) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32) -> !vm.ref<!hal.command_buffer> attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.begin(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.end(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>, %label : !vm.buffer) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %pattern : i32, %pattern_length : i32) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %offset : i32, %values : i32 ...) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.bind_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %descriptor_set : !vm.ref<!hal.descriptor_set>, %dynamic_offsets : i64 ...) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64) attributes {sym_visibility = "private"}
vm.import @hal.descriptor_set.create(%device : !vm.ref<!hal.device>, %set_layout : !vm.ref<!hal.descriptor_set_layout>, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) -> !vm.ref<!hal.descriptor_set> attributes {sym_visibility = "private"}
vm.import @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %usage_type : i32, %bindings : tuple<i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.device.query.i32(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i32) attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %executable_layouts : !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.executable_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.semaphore.create(%device : !vm.ref<!hal.device>, %initial_value : i64) -> !vm.ref<!hal.semaphore> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.semaphore.query(%semaphore : !vm.ref<!hal.semaphore>) -> (i32, i64) attributes {sym_visibility = "private"}
vm.import @hal.semaphore.signal(%semaphore : !vm.ref<!hal.semaphore>, %new_value : i64) attributes {sym_visibility = "private"}
vm.import @hal.semaphore.fail(%semaphore : !vm.ref<!hal.semaphore>, %status : i32) attributes {sym_visibility = "private"}
vm.import @hal.semaphore.await(%semaphore : !vm.ref<!hal.semaphore>, %min_value : i64) -> i32 attributes {sym_visibility = "private"}
// The 6 bytes below are the ASCII string "tensor"; @simple_mul passes it as the %message operand of its assert calls.
vm.rodata private @_utf8_tensor_3C6209B4FD120BDC {alignment = 1 : i64} dense<[116, 101, 110, 115, 111, 114]> : vector<6xi8>
// @simple_mul: takes two buffer views, validates them, allocates an output buffer, records and
// submits a command buffer containing one dispatch of the cached executable, and returns a new
// buffer view over the output buffer. Fails with status 2 when the cached device query is false.
vm.func private @simple_mul(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> {
// Shared constants. %c16 and %c4 are not referenced anywhere in this function body
// (the later uses go through separately materialized constants such as %c16_5 / %c4_1).
%c2 = vm.const.i32 2
%zero = vm.const.i32.zero
%c16 = vm.const.i32 16
%c4 = vm.const.i32 4
%c1 = vm.const.i32 1
%c553648160 = vm.const.i32 553648160
%c1_0 = vm.const.i32 1
// Load the state cached in module globals.
%_device_query_0 = vm.global.load.i32 @_device_query_0 : i32
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
%_executable_simple_mul_dispatch_0 = vm.global.load.ref @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable>
// Validate %arg0: message "tensor", element_type = 553648160, encoding_type = 1, shape = [4]
// (operand roles per the @hal.buffer_view.assert import).
%_utf8_tensor_3C6209B4FD120BDC = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer
%c4_1 = vm.const.i64 4
vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_tensor_3C6209B4FD120BDC, %c553648160, %c1_0, [%c4_1]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
%ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
// Device and allocator used for the buffer asserts, the output allocation and the command buffer.
%ref_2 = vm.call @hal.ex.shared_device() {nosideeffects} : () -> !vm.ref<!hal.device>
%ref_3 = vm.call @hal.device.allocator(%ref_2) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
// Validate %arg0's backing buffer: minimum_length = 16, memory_types = 16, buffer_usage = 3075.
%_utf8_tensor_3C6209B4FD120BDC_4 = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer
%c16_5 = vm.const.i64 16
%c16_6 = vm.const.i32 16
%c3075 = vm.const.i32 3075
vm.call @hal.buffer.assert(%ref, %_utf8_tensor_3C6209B4FD120BDC_4, %ref_3, %c16_5, %c16_6, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
// Same view and buffer validation for %arg1.
%_utf8_tensor_3C6209B4FD120BDC_7 = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer
%c4_8 = vm.const.i64 4
vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_tensor_3C6209B4FD120BDC_7, %c553648160, %c1_0, [%c4_8]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
%ref_9 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
%_utf8_tensor_3C6209B4FD120BDC_10 = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer
%c16_11 = vm.const.i64 16
%c16_12 = vm.const.i32 16
%c3075_13 = vm.const.i32 3075
vm.call @hal.buffer.assert(%ref_9, %_utf8_tensor_3C6209B4FD120BDC_10, %ref_3, %c16_11, %c16_12, %c3075_13) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
// Allocate the output buffer: memory_types = 50, buffer_usage = 150998019, allocation_size = 16.
%c50 = vm.const.i32 50
%c150998019 = vm.const.i32 150998019
%c16_14 = vm.const.i64 16
%ref_15 = vm.call @hal.allocator.allocate(%ref_3, %c50, %c150998019, %c16_14) : (!vm.ref<!hal.allocator>, i32, i32, i64) -> !vm.ref<!hal.buffer>
// Create (modes = 17, command_categories = 3) and begin the command buffer.
// Note that both happen before the device-support branch below.
%c17 = vm.const.i32 17
%c3 = vm.const.i32 3
%ref_16 = vm.call @hal.command_buffer.create(%ref_2, %c17, %c3) : (!vm.ref<!hal.device>, i32, i32) -> !vm.ref<!hal.command_buffer>
vm.call @hal.command_buffer.begin(%ref_16) : (!vm.ref<!hal.command_buffer>) -> ()
vm.cond_br %_device_query_0, ^bb1, ^bb2
// ^bb1 (cached query true): record the dispatch, submit it, and return the result view.
^bb1: // pred: ^bb0
%zero_17 = vm.const.i64.zero
%c16_18 = vm.const.i64 16
%zero_19 = vm.const.i64.zero
%c16_20 = vm.const.i64 16
%zero_21 = vm.const.i64.zero
%c16_22 = vm.const.i64 16
// Push set 0 with three (i32, buffer, i64, i64) binding tuples: (0, %arg0's buffer, 0, 16),
// (1, %arg1's buffer, 0, 16) and (2, output buffer, 0, 16).
// NOTE(review): the i64 pair is presumably (offset, length) - confirm against the HAL import docs.
vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_16, %_executable_layout_0, %zero, [(%zero, %ref, %zero_17, %c16_18), (%c1, %ref_9, %zero_19, %c16_20), (%c2, %ref_15, %zero_21, %c16_22)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable_layout>, i32, tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...)
// Dispatch entry_point 0 with workgroup_x/y/z = 1/1/1.
%zero_23 = vm.const.i32.zero
vm.call @hal.command_buffer.dispatch(%ref_16, %_executable_simple_mul_dispatch_0, %zero_23, %c1, %c1, %c1) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32) -> ()
// Execution barrier: source_stage_mask = 28, target_stage_mask = 13, flags = 0.
%c28 = vm.const.i32 28
%c13 = vm.const.i32 13
%zero_24 = vm.const.i32.zero
vm.call @hal.command_buffer.execution_barrier(%ref_16, %c28, %c13, %zero_24) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
vm.call @hal.command_buffer.end(%ref_16) : (!vm.ref<!hal.command_buffer>) -> ()
vm.call @hal.ex.submit_and_wait(%ref_2, %ref_16) : (!vm.ref<!hal.device>, !vm.ref<!hal.command_buffer>) -> ()
// Wrap the output buffer in a view with the same element_type/encoding_type and shape [4] as the inputs.
%c4_25 = vm.const.i64 4
%ref_26 = vm.call.variadic @hal.buffer_view.create(%ref_15, %c553648160, %c1_0, [%c4_25]) {nosideeffects} : (!vm.ref<!hal.buffer>, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
vm.return %ref_26 : !vm.ref<!hal.buffer_view>
// ^bb2 (cached query false): fail with status code 2. The command buffer was already begun
// above and this path does not call @hal.command_buffer.end.
^bb2: // pred: ^bb0
%c2_27 = vm.const.i32 2
vm.fail %c2_27, "device not supported in the compiled configuration"
}
vm.export @simple_mul
}
// -----// IR Dump After mlir::iree_compiler::IREE::VM::GlobalInitializationPass //----- //
vm.module public @module {
// Module-level state: three private globals followed by three read-only data blobs.
// In this dump the globals are declared `mutable`; they are stored by @__init and loaded by @simple_mul.
vm.global.i32 private mutable @_device_query_0 : i32
vm.global.ref private mutable @_executable_layout_0 : !vm.ref<!hal.executable_layout>
vm.global.ref private mutable @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable>
// Embedded executable payload (mime_type "application/x-elf", 16-byte alignment attribute).
// NOTE(review): the dense<"0x...> literal on the next line looks elided/truncated in this dump
// (no closing quote or "> :" before the vector type) - confirm against the original compiler output.
vm.rodata private @simple_mul_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0xvector<3808xi8>
// The 21 bytes below are the ASCII string "hal.executable.format".
vm.rodata private @_utf8_hal_executable_format_EAB228F999C2D3A1 {alignment = 1 : i64} dense<[104, 97, 108, 46, 101, 120, 101, 99, 117, 116, 97, 98, 108, 101, 46, 102, 111, 114, 109, 97, 116]> : vector<21xi8>
// The 19 bytes below are the ASCII string "embedded-elf-x86_64".
vm.rodata private @_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 {alignment = 1 : i64} dense<[101, 109, 98, 101, 100, 100, 101, 100, 45, 101, 108, 102, 45, 120, 56, 54, 95, 54, 52]> : vector<19xi8>
vm.import @hal.ex.shared_device() -> !vm.ref<!hal.device> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.ex.submit_and_wait(%device : !vm.ref<!hal.device>, %command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
vm.import @hal.allocator.map.byte_buffer(%allocator : !vm.ref<!hal.allocator>, %try : i32, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
vm.import @hal.allocator.wrap.byte_buffer(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
vm.import @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) attributes {sym_visibility = "private"}
vm.import @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer.length(%buffer : !vm.ref<!hal.buffer>) -> i64 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i32) -> i32 attributes {sym_visibility = "private"}
vm.import @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i32) attributes {sym_visibility = "private"}
vm.import @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) attributes {sym_visibility = "private"}
vm.import @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.byte_length(%buffer_view : !vm.ref<!hal.buffer_view>) -> i64 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.encoding_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref<!hal.buffer_view> ...) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32) -> !vm.ref<!hal.command_buffer> attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.begin(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.end(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>, %label : !vm.buffer) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %pattern : i32, %pattern_length : i32) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %offset : i32, %values : i32 ...) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.bind_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %descriptor_set : !vm.ref<!hal.descriptor_set>, %dynamic_offsets : i64 ...) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64) attributes {sym_visibility = "private"}
vm.import @hal.descriptor_set.create(%device : !vm.ref<!hal.device>, %set_layout : !vm.ref<!hal.descriptor_set_layout>, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) -> !vm.ref<!hal.descriptor_set> attributes {sym_visibility = "private"}
vm.import @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %usage_type : i32, %bindings : tuple<i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.device.query.i32(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i32) attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %executable_layouts : !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.executable_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.semaphore.create(%device : !vm.ref<!hal.device>, %initial_value : i64) -> !vm.ref<!hal.semaphore> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.semaphore.query(%semaphore : !vm.ref<!hal.semaphore>) -> (i32, i64) attributes {sym_visibility = "private"}
vm.import @hal.semaphore.signal(%semaphore : !vm.ref<!hal.semaphore>, %new_value : i64) attributes {sym_visibility = "private"}
vm.import @hal.semaphore.fail(%semaphore : !vm.ref<!hal.semaphore>, %status : i32) attributes {sym_visibility = "private"}
vm.import @hal.semaphore.await(%semaphore : !vm.ref<!hal.semaphore>, %min_value : i64) -> i32 attributes {sym_visibility = "private"}
// The 6 bytes below are the ASCII string "tensor"; @simple_mul passes it as the %message operand of its assert calls.
vm.rodata private @_utf8_tensor_3C6209B4FD120BDC {alignment = 1 : i64} dense<[116, 101, 110, 115, 111, 114]> : vector<6xi8>
// @simple_mul: takes two buffer views, validates them, allocates an output buffer, records and
// submits a command buffer containing one dispatch of the cached executable, and returns a new
// buffer view over the output buffer. Fails with status 2 when the cached device query is false.
vm.func private @simple_mul(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> {
// Shared constants. %c16 and %c4 are not referenced anywhere in this function body
// (the later uses go through separately materialized constants such as %c16_5 / %c4_1).
%c2 = vm.const.i32 2
%zero = vm.const.i32.zero
%c16 = vm.const.i32 16
%c4 = vm.const.i32 4
%c1 = vm.const.i32 1
%c553648160 = vm.const.i32 553648160
%c1_0 = vm.const.i32 1
// Load the state cached in module globals (stored by @__init in this module).
%_device_query_0 = vm.global.load.i32 @_device_query_0 : i32
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
%_executable_simple_mul_dispatch_0 = vm.global.load.ref @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable>
// Validate %arg0: message "tensor", element_type = 553648160, encoding_type = 1, shape = [4]
// (operand roles per the @hal.buffer_view.assert import).
%_utf8_tensor_3C6209B4FD120BDC = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer
%c4_1 = vm.const.i64 4
vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_tensor_3C6209B4FD120BDC, %c553648160, %c1_0, [%c4_1]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
%ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
// Device and allocator used for the buffer asserts, the output allocation and the command buffer.
%ref_2 = vm.call @hal.ex.shared_device() {nosideeffects} : () -> !vm.ref<!hal.device>
%ref_3 = vm.call @hal.device.allocator(%ref_2) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
// Validate %arg0's backing buffer: minimum_length = 16, memory_types = 16, buffer_usage = 3075.
%_utf8_tensor_3C6209B4FD120BDC_4 = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer
%c16_5 = vm.const.i64 16
%c16_6 = vm.const.i32 16
%c3075 = vm.const.i32 3075
vm.call @hal.buffer.assert(%ref, %_utf8_tensor_3C6209B4FD120BDC_4, %ref_3, %c16_5, %c16_6, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
// Same view and buffer validation for %arg1.
%_utf8_tensor_3C6209B4FD120BDC_7 = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer
%c4_8 = vm.const.i64 4
vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_tensor_3C6209B4FD120BDC_7, %c553648160, %c1_0, [%c4_8]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
%ref_9 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
%_utf8_tensor_3C6209B4FD120BDC_10 = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer
%c16_11 = vm.const.i64 16
%c16_12 = vm.const.i32 16
%c3075_13 = vm.const.i32 3075
vm.call @hal.buffer.assert(%ref_9, %_utf8_tensor_3C6209B4FD120BDC_10, %ref_3, %c16_11, %c16_12, %c3075_13) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
// Allocate the output buffer: memory_types = 50, buffer_usage = 150998019, allocation_size = 16.
%c50 = vm.const.i32 50
%c150998019 = vm.const.i32 150998019
%c16_14 = vm.const.i64 16
%ref_15 = vm.call @hal.allocator.allocate(%ref_3, %c50, %c150998019, %c16_14) : (!vm.ref<!hal.allocator>, i32, i32, i64) -> !vm.ref<!hal.buffer>
// Create (modes = 17, command_categories = 3) and begin the command buffer.
// Note that both happen before the device-support branch below.
%c17 = vm.const.i32 17
%c3 = vm.const.i32 3
%ref_16 = vm.call @hal.command_buffer.create(%ref_2, %c17, %c3) : (!vm.ref<!hal.device>, i32, i32) -> !vm.ref<!hal.command_buffer>
vm.call @hal.command_buffer.begin(%ref_16) : (!vm.ref<!hal.command_buffer>) -> ()
vm.cond_br %_device_query_0, ^bb1, ^bb2
// ^bb1 (cached query true): record the dispatch, submit it, and return the result view.
^bb1: // pred: ^bb0
%zero_17 = vm.const.i64.zero
%c16_18 = vm.const.i64 16
%zero_19 = vm.const.i64.zero
%c16_20 = vm.const.i64 16
%zero_21 = vm.const.i64.zero
%c16_22 = vm.const.i64 16
// Push set 0 with three (i32, buffer, i64, i64) binding tuples: (0, %arg0's buffer, 0, 16),
// (1, %arg1's buffer, 0, 16) and (2, output buffer, 0, 16).
// NOTE(review): the i64 pair is presumably (offset, length) - confirm against the HAL import docs.
vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_16, %_executable_layout_0, %zero, [(%zero, %ref, %zero_17, %c16_18), (%c1, %ref_9, %zero_19, %c16_20), (%c2, %ref_15, %zero_21, %c16_22)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable_layout>, i32, tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...)
// Dispatch entry_point 0 with workgroup_x/y/z = 1/1/1.
%zero_23 = vm.const.i32.zero
vm.call @hal.command_buffer.dispatch(%ref_16, %_executable_simple_mul_dispatch_0, %zero_23, %c1, %c1, %c1) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32) -> ()
// Execution barrier: source_stage_mask = 28, target_stage_mask = 13, flags = 0.
%c28 = vm.const.i32 28
%c13 = vm.const.i32 13
%zero_24 = vm.const.i32.zero
vm.call @hal.command_buffer.execution_barrier(%ref_16, %c28, %c13, %zero_24) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
vm.call @hal.command_buffer.end(%ref_16) : (!vm.ref<!hal.command_buffer>) -> ()
vm.call @hal.ex.submit_and_wait(%ref_2, %ref_16) : (!vm.ref<!hal.device>, !vm.ref<!hal.command_buffer>) -> ()
// Wrap the output buffer in a view with the same element_type/encoding_type and shape [4] as the inputs.
%c4_25 = vm.const.i64 4
%ref_26 = vm.call.variadic @hal.buffer_view.create(%ref_15, %c553648160, %c1_0, [%c4_25]) {nosideeffects} : (!vm.ref<!hal.buffer>, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
vm.return %ref_26 : !vm.ref<!hal.buffer_view>
// ^bb2 (cached query false): fail with status code 2. The command buffer was already begun
// above and this path does not call @hal.command_buffer.end.
^bb2: // pred: ^bb0
%c2_27 = vm.const.i32 2
vm.fail %c2_27, "device not supported in the compiled configuration"
}
vm.export @simple_mul
vm.export @__init
// @__init: exported module initialization function. Queries the shared device, builds the
// descriptor-set/executable layouts, and (when the query succeeds) creates the executable from
// the embedded ELF rodata. Results are stored into @_device_query_0, @_executable_layout_0 and
// @_executable_simple_mul_dispatch_0.
vm.func private @__init() {
%ref = vm.call @hal.ex.shared_device() {nosideeffects} : () -> !vm.ref<!hal.device>
// Query the device with category "hal.executable.format" and key "embedded-elf-x86_64"
// (the two rodata byte strings; %category/%key per the @hal.device.query.i32 import).
%_utf8_hal_executable_format_EAB228F999C2D3A1 = vm.const.ref.rodata @_utf8_hal_executable_format_EAB228F999C2D3A1 : !vm.buffer
%_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 : !vm.buffer
%0:2 = vm.call @hal.device.query.i32(%ref, %_utf8_hal_executable_format_EAB228F999C2D3A1, %_utf8_embedded_elf_x86_64_9FD8733DA4A6F228) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i32)
// %2 = (%0#0 ? (%0#1 & 1) : 0): the low bit of the second result, gated by the first result.
// NOTE(review): presumably %0#0 is an "ok" flag and %0#1 the queried value - confirm against the HAL import docs.
%c1 = vm.const.i32 1
%1 = vm.and.i32 %0#1, %c1 : i32
%zero = vm.const.i32.zero
%2 = vm.select.i32 %0#0, %1, %zero : i32
// Constants for the layout calls below. %c1_0 is not referenced anywhere else in this function.
%c1_0 = vm.const.i32 1
%c1_1 = vm.const.i32 1
%zero_2 = vm.const.i32.zero
%c7 = vm.const.i32 7
%c1_3 = vm.const.i32 1
%c7_4 = vm.const.i32 7
%c2 = vm.const.i32 2
%c7_5 = vm.const.i32 7
// Descriptor set layout: usage_type = 1 and three (i32, i32) binding tuples (0, 7), (1, 7), (2, 7).
// NOTE(review): the meaning of each tuple element is not visible in this dump - confirm before relying on it.
%ref_6 = vm.call.variadic @hal.descriptor_set_layout.create(%ref, %c1_1, [(%zero_2, %c7), (%c1_3, %c7_4), (%c2, %c7_5)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
// Executable layout: push_constants = 0 and a single set layout (%ref_6).
%zero_7 = vm.const.i32.zero
%ref_8 = vm.call.variadic @hal.executable_layout.create(%ref, %zero_7, [%ref_6]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout>
// Publish the query result and the layout before branching on the query result.
vm.global.store.i32 %2, @_device_query_0 : i32
vm.global.store.ref %ref_8, @_executable_layout_0 : !vm.ref<!hal.executable_layout>
vm.cond_br %2, ^bb1, ^bb2
// ^bb1 (query result true): create the executable from the embedded ELF blob, passing the
// "embedded-elf-x86_64" format string, a null constants buffer, and the layout reloaded from the global.
^bb1: // pred: ^bb0
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
%_utf8_embedded_elf_x86_64_9FD8733DA4A6F228_9 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 : !vm.buffer
%simple_mul_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @simple_mul_dispatch_0_embedded_elf_x86_64 : !vm.buffer
%null = vm.const.ref.zero : !vm.buffer
%ref_10 = vm.call.variadic @hal.executable.create(%ref, %_utf8_embedded_elf_x86_64_9FD8733DA4A6F228_9, %simple_mul_dispatch_0_embedded_elf_x86_64, %null, [%_executable_layout_0]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable>
vm.br ^bb3(%ref_10 : !vm.ref<!hal.executable>)
// ^bb2 (query result false): no executable is created; a null reference is forwarded instead.
^bb2: // pred: ^bb0
%null_11 = vm.const.ref.zero : !vm.ref<!hal.executable>
vm.br ^bb3(%null_11 : !vm.ref<!hal.executable>)
// ^bb3: join point; stores whichever executable reference (real or null) arrived as the block argument.
^bb3(%3: !vm.ref<!hal.executable>): // 2 preds: ^bb1, ^bb2
vm.global.store.ref %3, @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable>
vm.br ^bb4
// ^bb4: single exit block; it only returns.
^bb4: // pred: ^bb3
vm.return
}
vm.export @__deinit
// @__deinit: exported module deinitialization function. Its body is empty - it returns immediately.
vm.func private @__deinit() {
vm.return
}
}
// -----// IR Dump After Canonicalizer //----- //
// @__deinit as printed after the Canonicalizer pass: still an empty function that returns immediately.
vm.func private @__deinit() {
vm.return
}
// -----// IR Dump After Canonicalizer //----- //
// @__init: populates the three module globals read by @simple_mul.
//   @_device_query_0                   <- i32 flag derived from hal.device.query.i32
//   @_executable_layout_0              <- executable layout built from one descriptor-set layout
//   @_executable_simple_mul_dispatch_0 <- executable created from the embedded ELF rodata,
//                                         or a null ref when the flag is zero
// Takes no arguments and returns nothing.
vm.func private @__init() {
// Null refs: %null is the value forwarded to ^bb3 from ^bb2; %null_0 is passed
// as the %constants operand of hal.executable.create.
%null = vm.const.ref.zero : !vm.ref<!hal.executable>
%null_0 = vm.const.ref.zero : !vm.buffer
// Integer constants shared by the calls below.
%c2 = vm.const.i32 2
%c7 = vm.const.i32 7
%zero = vm.const.i32.zero
%c1 = vm.const.i32 1
%ref = vm.call @hal.ex.shared_device() {nosideeffects} : () -> !vm.ref<!hal.device>
// The two rodata buffers hold the UTF-8 bytes "hal.executable.format" and
// "embedded-elf-x86_64"; they are passed as the query %category and %key.
%_utf8_hal_executable_format_EAB228F999C2D3A1 = vm.const.ref.rodata @_utf8_hal_executable_format_EAB228F999C2D3A1 : !vm.buffer
%_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 : !vm.buffer
%0:2 = vm.call @hal.device.query.i32(%ref, %_utf8_hal_executable_format_EAB228F999C2D3A1, %_utf8_embedded_elf_x86_64_9FD8733DA4A6F228) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i32)
// %2 = (low bit of the second query result) when the first result is non-zero,
// otherwise 0. NOTE(review): presumably %0#0 is an "ok" flag and %0#1 the
// queried value -- confirm against the HAL module documentation.
%1 = vm.and.i32 %0#1, %c1 : i32
%2 = vm.select.i32 %0#0, %1, %zero : i32
// Descriptor-set layout with %usage_type = 1 and three (i32, i32) binding tuples
// whose first elements are 0, 1, 2 and whose second element is 7 for each.
%ref_1 = vm.call.variadic @hal.descriptor_set_layout.create(%ref, %c1, [(%zero, %c7), (%c1, %c7), (%c2, %c7)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
// Executable layout with %push_constants = 0 and the single set layout above.
%ref_2 = vm.call.variadic @hal.executable_layout.create(%ref, %zero, [%ref_1]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout>
// Publish the query flag and the layout before branching on the flag.
vm.global.store.i32 %2, @_device_query_0 : i32
vm.global.store.ref %ref_2, @_executable_layout_0 : !vm.ref<!hal.executable_layout>
vm.cond_br %2, ^bb1, ^bb2
// ^bb1 (flag non-zero): create the executable from the embedded ELF rodata.
^bb1: // pred: ^bb0
// The layout is re-read from the global that was stored just above.
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
%_utf8_embedded_elf_x86_64_9FD8733DA4A6F228_3 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 : !vm.buffer
%simple_mul_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @simple_mul_dispatch_0_embedded_elf_x86_64 : !vm.buffer
// Operands: device, format string, ELF data, null constants buffer, [layout].
%ref_4 = vm.call.variadic @hal.executable.create(%ref, %_utf8_embedded_elf_x86_64_9FD8733DA4A6F228_3, %simple_mul_dispatch_0_embedded_elf_x86_64, %null_0, [%_executable_layout_0]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable>
vm.br ^bb3(%ref_4 : !vm.ref<!hal.executable>)
// ^bb2 (flag zero): forward a null executable ref instead.
^bb2: // pred: ^bb0
vm.br ^bb3(%null : !vm.ref<!hal.executable>)
// ^bb3: store whichever executable ref arrived as the block argument.
^bb3(%3: !vm.ref<!hal.executable>): // 2 preds: ^bb1, ^bb2
vm.global.store.ref %3, @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable>
vm.return
}
// -----// IR Dump After Canonicalizer //----- //
// @simple_mul: takes two buffer views, checks them, allocates an output buffer,
// records one dispatch into a command buffer, submits it and waits, then
// returns a new buffer view over the output buffer.
//   %arg0, %arg1 : input !hal.buffer_view refs
//   returns      : !hal.buffer_view created over the freshly allocated buffer
// Fails via vm.fail (^bb2) when the @_device_query_0 global is zero.
vm.func private @simple_mul(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> {
// Constants. The operand roles noted further down come from the hal.* import
// signatures; the meaning of the individual bit patterns is not visible here.
%c13 = vm.const.i32 13
%c28 = vm.const.i32 28
%zero = vm.const.i64.zero
%c3 = vm.const.i32 3
%c17 = vm.const.i32 17
%c150998019 = vm.const.i32 150998019
%c50 = vm.const.i32 50
%c3075 = vm.const.i32 3075
%c16 = vm.const.i64 16
%c4 = vm.const.i64 4
%c2 = vm.const.i32 2
%zero_0 = vm.const.i32.zero
%c16_1 = vm.const.i32 16
%c1 = vm.const.i32 1
%c553648160 = vm.const.i32 553648160
// Load the state that @__init stored into the module globals.
%_device_query_0 = vm.global.load.i32 @_device_query_0 : i32
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
%_executable_simple_mul_dispatch_0 = vm.global.load.ref @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable>
// Rodata bytes spell "tensor"; used as the %message operand of the asserts.
%_utf8_tensor_3C6209B4FD120BDC = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer
// Assert %arg0: element_type = 553648160, encoding_type = 1, shape = [4].
vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_tensor_3C6209B4FD120BDC, %c553648160, %c1, [%c4]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
%ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
%ref_2 = vm.call @hal.ex.shared_device() {nosideeffects} : () -> !vm.ref<!hal.device>
%ref_3 = vm.call @hal.device.allocator(%ref_2) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
%_utf8_tensor_3C6209B4FD120BDC_4 = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer
// Assert %arg0's buffer: minimum_length = 16, memory_types = 16, buffer_usage = 3075.
vm.call @hal.buffer.assert(%ref, %_utf8_tensor_3C6209B4FD120BDC_4, %ref_3, %c16, %c16_1, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
%_utf8_tensor_3C6209B4FD120BDC_5 = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer
// Same two checks for %arg1, with identical operands.
vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_tensor_3C6209B4FD120BDC_5, %c553648160, %c1, [%c4]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
%ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
%_utf8_tensor_3C6209B4FD120BDC_7 = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer
vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_3C6209B4FD120BDC_7, %ref_3, %c16, %c16_1, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
// Output buffer: memory_types = 50, buffer_usage = 150998019, allocation_size = 16.
%ref_8 = vm.call @hal.allocator.allocate(%ref_3, %c50, %c150998019, %c16) : (!vm.ref<!hal.allocator>, i32, i32, i64) -> !vm.ref<!hal.buffer>
// Command buffer: modes = 17, command_categories = 3.
%ref_9 = vm.call @hal.command_buffer.create(%ref_2, %c17, %c3) : (!vm.ref<!hal.device>, i32, i32) -> !vm.ref<!hal.command_buffer>
// Recording begins before the branch, so the ^bb2 failure path leaves the
// command buffer begun without a matching hal.command_buffer.end call.
vm.call @hal.command_buffer.begin(%ref_9) : (!vm.ref<!hal.command_buffer>) -> ()
vm.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Bind set 0: ordinals 0, 1, 2 -> %arg0's buffer, %arg1's buffer, output buffer.
// Each tuple ends with two i64 values (0, 16) -- presumably offset and length;
// the import signature does not name the tuple fields.
vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %_executable_layout_0, %zero_0, [(%zero_0, %ref, %zero, %c16), (%c1, %ref_6, %zero, %c16), (%c2, %ref_8, %zero, %c16)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable_layout>, i32, tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...)
// Dispatch entry_point 0 with workgroup_x/y/z = 1, 1, 1.
vm.call @hal.command_buffer.dispatch(%ref_9, %_executable_simple_mul_dispatch_0, %zero_0, %c1, %c1, %c1) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32) -> ()
// Barrier: source_stage_mask = 28, target_stage_mask = 13, flags = 0.
vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero_0) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
vm.call @hal.command_buffer.end(%ref_9) : (!vm.ref<!hal.command_buffer>) -> ()
vm.call @hal.ex.submit_and_wait(%ref_2, %ref_9) : (!vm.ref<!hal.device>, !vm.ref<!hal.command_buffer>) -> ()
// Wrap the output buffer using the same element type, encoding and shape
// operands that were asserted on the inputs.
%ref_10 = vm.call.variadic @hal.buffer_view.create(%ref_8, %c553648160, %c1, [%c4]) {nosideeffects} : (!vm.ref<!hal.buffer>, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
vm.return %ref_10 : !vm.ref<!hal.buffer_view>
^bb2: // pred: ^bb0
// Reached when @_device_query_0 is zero: fail with status operand 2.
vm.fail %c2, "device not supported in the compiled configuration"
}
// -----// IR Dump After Inliner //----- //
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}>
module attributes {hal.device.targets = [#device_target_cpu], vm.toplevel} {
vm.module public @module {
vm.global.i32 private mutable @_device_query_0 : i32
vm.global.ref private mutable @_executable_layout_0 : !vm.ref<!hal.executable_layout>
vm.global.ref private mutable @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable>
vm.rodata private @simple_mul_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0xvector<3808xi8>
vm.rodata private @_utf8_hal_executable_format_EAB228F999C2D3A1 {alignment = 1 : i64} dense<[104, 97, 108, 46, 101, 120, 101, 99, 117, 116, 97, 98, 108, 101, 46, 102, 111, 114, 109, 97, 116]> : vector<21xi8>
vm.rodata private @_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 {alignment = 1 : i64} dense<[101, 109, 98, 101, 100, 100, 101, 100, 45, 101, 108, 102, 45, 120, 56, 54, 95, 54, 52]> : vector<19xi8>
vm.import @hal.ex.shared_device() -> !vm.ref<!hal.device> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.ex.submit_and_wait(%device : !vm.ref<!hal.device>, %command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
vm.import @hal.allocator.map.byte_buffer(%allocator : !vm.ref<!hal.allocator>, %try : i32, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
vm.import @hal.allocator.wrap.byte_buffer(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
vm.import @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) attributes {sym_visibility = "private"}
vm.import @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer.length(%buffer : !vm.ref<!hal.buffer>) -> i64 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i32) -> i32 attributes {sym_visibility = "private"}
vm.import @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i32) attributes {sym_visibility = "private"}
vm.import @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) attributes {sym_visibility = "private"}
vm.import @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.byte_length(%buffer_view : !vm.ref<!hal.buffer_view>) -> i64 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.encoding_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref<!hal.buffer_view> ...) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32) -> !vm.ref<!hal.command_buffer> attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.begin(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.end(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>, %label : !vm.buffer) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %pattern : i32, %pattern_length : i32) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %offset : i32, %values : i32 ...) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.bind_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %descriptor_set : !vm.ref<!hal.descriptor_set>, %dynamic_offsets : i64 ...) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64) attributes {sym_visibility = "private"}
vm.import @hal.descriptor_set.create(%device : !vm.ref<!hal.device>, %set_layout : !vm.ref<!hal.descriptor_set_layout>, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) -> !vm.ref<!hal.descriptor_set> attributes {sym_visibility = "private"}
vm.import @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %usage_type : i32, %bindings : tuple<i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.device.query.i32(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i32) attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %executable_layouts : !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.executable_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.semaphore.create(%device : !vm.ref<!hal.device>, %initial_value : i64) -> !vm.ref<!hal.semaphore> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.semaphore.query(%semaphore : !vm.ref<!hal.semaphore>) -> (i32, i64) attributes {sym_visibility = "private"}
vm.import @hal.semaphore.signal(%semaphore : !vm.ref<!hal.semaphore>, %new_value : i64) attributes {sym_visibility = "private"}
vm.import @hal.semaphore.fail(%semaphore : !vm.ref<!hal.semaphore>, %status : i32) attributes {sym_visibility = "private"}
vm.import @hal.semaphore.await(%semaphore : !vm.ref<!hal.semaphore>, %min_value : i64) -> i32 attributes {sym_visibility = "private"}
vm.rodata private @_utf8_tensor_3C6209B4FD120BDC {alignment = 1 : i64} dense<[116, 101, 110, 115, 111, 114]> : vector<6xi8>
// @simple_mul (exported by the vm.export that follows this function): checks
// two input buffer views, allocates an output buffer, records a single
// dispatch, submits it and waits, then returns a buffer view over the output.
//   %arg0, %arg1 : input !hal.buffer_view refs
//   returns      : !hal.buffer_view created over the freshly allocated buffer
// Takes the vm.fail path (^bb2) when the @_device_query_0 global is zero.
vm.func private @simple_mul(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> {
// Constants; operand roles below are taken from the vm.import signatures in
// this module. What the individual bit patterns encode is not visible here.
%c13 = vm.const.i32 13
%c28 = vm.const.i32 28
%zero = vm.const.i64.zero
%c3 = vm.const.i32 3
%c17 = vm.const.i32 17
%c150998019 = vm.const.i32 150998019
%c50 = vm.const.i32 50
%c3075 = vm.const.i32 3075
%c16 = vm.const.i64 16
%c4 = vm.const.i64 4
%c2 = vm.const.i32 2
%zero_0 = vm.const.i32.zero
%c16_1 = vm.const.i32 16
%c1 = vm.const.i32 1
%c553648160 = vm.const.i32 553648160
// Module globals written by @__init.
%_device_query_0 = vm.global.load.i32 @_device_query_0 : i32
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
%_executable_simple_mul_dispatch_0 = vm.global.load.ref @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable>
// "tensor" (UTF-8 rodata) is the %message operand for every assert below.
%_utf8_tensor_3C6209B4FD120BDC = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer
// %arg0 view check: element_type = 553648160, encoding_type = 1, shape = [4].
vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_tensor_3C6209B4FD120BDC, %c553648160, %c1, [%c4]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
%ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
%ref_2 = vm.call @hal.ex.shared_device() {nosideeffects} : () -> !vm.ref<!hal.device>
%ref_3 = vm.call @hal.device.allocator(%ref_2) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
%_utf8_tensor_3C6209B4FD120BDC_4 = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer
// %arg0 buffer check: minimum_length = 16, memory_types = 16, buffer_usage = 3075.
vm.call @hal.buffer.assert(%ref, %_utf8_tensor_3C6209B4FD120BDC_4, %ref_3, %c16, %c16_1, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
%_utf8_tensor_3C6209B4FD120BDC_5 = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer
// %arg1 receives the same view and buffer checks with identical operands.
vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_tensor_3C6209B4FD120BDC_5, %c553648160, %c1, [%c4]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
%ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
%_utf8_tensor_3C6209B4FD120BDC_7 = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer
vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_3C6209B4FD120BDC_7, %ref_3, %c16, %c16_1, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
// Allocate the output: memory_types = 50, buffer_usage = 150998019, allocation_size = 16.
%ref_8 = vm.call @hal.allocator.allocate(%ref_3, %c50, %c150998019, %c16) : (!vm.ref<!hal.allocator>, i32, i32, i64) -> !vm.ref<!hal.buffer>
// Create the command buffer: modes = 17, command_categories = 3.
%ref_9 = vm.call @hal.command_buffer.create(%ref_2, %c17, %c3) : (!vm.ref<!hal.device>, i32, i32) -> !vm.ref<!hal.command_buffer>
// begin is issued ahead of the branch; the ^bb2 failure path therefore has no
// matching hal.command_buffer.end call.
vm.call @hal.command_buffer.begin(%ref_9) : (!vm.ref<!hal.command_buffer>) -> ()
vm.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Set 0 bindings: ordinal 0 -> %arg0 buffer, 1 -> %arg1 buffer, 2 -> output.
// The trailing i64 pair (0, 16) is presumably offset/length; the import
// signature leaves the tuple fields unnamed.
vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %_executable_layout_0, %zero_0, [(%zero_0, %ref, %zero, %c16), (%c1, %ref_6, %zero, %c16), (%c2, %ref_8, %zero, %c16)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable_layout>, i32, tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...)
// entry_point = 0; workgroup_x, workgroup_y, workgroup_z = 1, 1, 1.
vm.call @hal.command_buffer.dispatch(%ref_9, %_executable_simple_mul_dispatch_0, %zero_0, %c1, %c1, %c1) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32) -> ()
// source_stage_mask = 28, target_stage_mask = 13, flags = 0.
vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero_0) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
vm.call @hal.command_buffer.end(%ref_9) : (!vm.ref<!hal.command_buffer>) -> ()
vm.call @hal.ex.submit_and_wait(%ref_2, %ref_9) : (!vm.ref<!hal.device>, !vm.ref<!hal.command_buffer>) -> ()
// Result view reuses the element type, encoding and shape operands asserted
// on the inputs.
%ref_10 = vm.call.variadic @hal.buffer_view.create(%ref_8, %c553648160, %c1, [%c4]) {nosideeffects} : (!vm.ref<!hal.buffer>, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
vm.return %ref_10 : !vm.ref<!hal.buffer_view>
^bb2: // pred: ^bb0
// @_device_query_0 was zero: fail with status operand 2.
vm.fail %c2, "device not supported in the compiled configuration"
}
vm.export @simple_mul
vm.export @__init
// @__init (exported by the vm.export on the preceding line): fills in the
// module globals that @simple_mul loads.
//   @_device_query_0                   <- i32 flag computed from hal.device.query.i32
//   @_executable_layout_0              <- executable layout over one descriptor-set layout
//   @_executable_simple_mul_dispatch_0 <- executable built from the embedded ELF rodata,
//                                         or a null ref when the flag is zero
vm.func private @__init() {
// %null feeds ^bb3 from the ^bb2 path; %null_0 is the %constants operand of
// hal.executable.create.
%null = vm.const.ref.zero : !vm.ref<!hal.executable>
%null_0 = vm.const.ref.zero : !vm.buffer
// Integer constants used by the layout/query calls.
%c2 = vm.const.i32 2
%c7 = vm.const.i32 7
%zero = vm.const.i32.zero
%c1 = vm.const.i32 1
%ref = vm.call @hal.ex.shared_device() {nosideeffects} : () -> !vm.ref<!hal.device>
// Query %category = "hal.executable.format", %key = "embedded-elf-x86_64"
// (the UTF-8 contents of the two rodata segments declared in this module).
%_utf8_hal_executable_format_EAB228F999C2D3A1 = vm.const.ref.rodata @_utf8_hal_executable_format_EAB228F999C2D3A1 : !vm.buffer
%_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 : !vm.buffer
%0:2 = vm.call @hal.device.query.i32(%ref, %_utf8_hal_executable_format_EAB228F999C2D3A1, %_utf8_embedded_elf_x86_64_9FD8733DA4A6F228) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i32)
// %2 is bit 0 of %0#1 when %0#0 is non-zero, else 0. NOTE(review): %0#0 looks
// like a success flag and %0#1 the value -- confirm against the HAL module docs.
%1 = vm.and.i32 %0#1, %c1 : i32
%2 = vm.select.i32 %0#0, %1, %zero : i32
// %usage_type = 1; three (i32, i32) binding tuples: (0, 7), (1, 7), (2, 7).
%ref_1 = vm.call.variadic @hal.descriptor_set_layout.create(%ref, %c1, [(%zero, %c7), (%c1, %c7), (%c2, %c7)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
// %push_constants = 0; %set_layouts = [the layout created above].
%ref_2 = vm.call.variadic @hal.executable_layout.create(%ref, %zero, [%ref_1]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout>
// Store the flag and layout, then branch on the flag.
vm.global.store.i32 %2, @_device_query_0 : i32
vm.global.store.ref %ref_2, @_executable_layout_0 : !vm.ref<!hal.executable_layout>
vm.cond_br %2, ^bb1, ^bb2
// ^bb1: flag non-zero -> build the executable.
^bb1: // pred: ^bb0
// Reloads the layout from the global written immediately before the branch.
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
%_utf8_embedded_elf_x86_64_9FD8733DA4A6F228_3 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 : !vm.buffer
%simple_mul_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @simple_mul_dispatch_0_embedded_elf_x86_64 : !vm.buffer
// device, executable_format, executable_data, constants (null), [layout].
%ref_4 = vm.call.variadic @hal.executable.create(%ref, %_utf8_embedded_elf_x86_64_9FD8733DA4A6F228_3, %simple_mul_dispatch_0_embedded_elf_x86_64, %null_0, [%_executable_layout_0]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable>
vm.br ^bb3(%ref_4 : !vm.ref<!hal.executable>)
// ^bb2: flag zero -> pass a null executable ref.
^bb2: // pred: ^bb0
vm.br ^bb3(%null : !vm.ref<!hal.executable>)
// ^bb3: store the executable ref received as the block argument.
^bb3(%3: !vm.ref<!hal.executable>): // 2 preds: ^bb1, ^bb2
vm.global.store.ref %3, @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable>
vm.return
}
vm.export @__deinit
// @__deinit (exported by the vm.export on the preceding line): no arguments,
// no results, and no operations besides the vm.return terminator.
vm.func private @__deinit() {
vm.return
}
}
}
// -----// IR Dump After Canonicalizer //----- //
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}>
module attributes {hal.device.targets = [#device_target_cpu], vm.toplevel} {
vm.module public @module {
vm.global.i32 private mutable @_device_query_0 : i32
vm.global.ref private mutable @_executable_layout_0 : !vm.ref<!hal.executable_layout>
vm.global.ref private mutable @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable>
vm.rodata private @simple_mul_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0xvector<3808xi8>
vm.rodata private @_utf8_hal_executable_format_EAB228F999C2D3A1 {alignment = 1 : i64} dense<[104, 97, 108, 46, 101, 120, 101, 99, 117, 116, 97, 98, 108, 101, 46, 102, 111, 114, 109, 97, 116]> : vector<21xi8>
vm.rodata private @_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 {alignment = 1 : i64} dense<[101, 109, 98, 101, 100, 100, 101, 100, 45, 101, 108, 102, 45, 120, 56, 54, 95, 54, 52]> : vector<19xi8>
vm.import @hal.ex.shared_device() -> !vm.ref<!hal.device> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.ex.submit_and_wait(%device : !vm.ref<!hal.device>, %command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
vm.import @hal.allocator.map.byte_buffer(%allocator : !vm.ref<!hal.allocator>, %try : i32, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
vm.import @hal.allocator.wrap.byte_buffer(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
vm.import @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) attributes {sym_visibility = "private"}
vm.import @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer.length(%buffer : !vm.ref<!hal.buffer>) -> i64 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i32) -> i32 attributes {sym_visibility = "private"}
vm.import @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i32) attributes {sym_visibility = "private"}
vm.import @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) attributes {sym_visibility = "private"}
vm.import @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.byte_length(%buffer_view : !vm.ref<!hal.buffer_view>) -> i64 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.encoding_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref<!hal.buffer_view> ...) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32) -> !vm.ref<!hal.command_buffer> attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.begin(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.end(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>, %label : !vm.buffer) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %pattern : i32, %pattern_length : i32) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %offset : i32, %values : i32 ...) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.bind_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %descriptor_set : !vm.ref<!hal.descriptor_set>, %dynamic_offsets : i64 ...) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64) attributes {sym_visibility = "private"}
vm.import @hal.descriptor_set.create(%device : !vm.ref<!hal.device>, %set_layout : !vm.ref<!hal.descriptor_set_layout>, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) -> !vm.ref<!hal.descriptor_set> attributes {sym_visibility = "private"}
vm.import @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %usage_type : i32, %bindings : tuple<i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.device.query.i32(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i32) attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %executable_layouts : !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.executable_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.semaphore.create(%device : !vm.ref<!hal.device>, %initial_value : i64) -> !vm.ref<!hal.semaphore> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.semaphore.query(%semaphore : !vm.ref<!hal.semaphore>) -> (i32, i64) attributes {sym_visibility = "private"}
vm.import @hal.semaphore.signal(%semaphore : !vm.ref<!hal.semaphore>, %new_value : i64) attributes {sym_visibility = "private"}
vm.import @hal.semaphore.fail(%semaphore : !vm.ref<!hal.semaphore>, %status : i32) attributes {sym_visibility = "private"}
vm.import @hal.semaphore.await(%semaphore : !vm.ref<!hal.semaphore>, %min_value : i64) -> i32 attributes {sym_visibility = "private"}
vm.rodata private @_utf8_tensor_3C6209B4FD120BDC {alignment = 1 : i64} dense<[116, 101, 110, 115, 111, 114]> : vector<6xi8>
// @simple_mul: takes two !hal.buffer_view arguments and returns a newly created
// !hal.buffer_view.
//
// Steps, in order:
//   1. Materialize constants and load the globals written by @__init.
//   2. Assert each input buffer view and its backing buffer.
//   3. Allocate a result buffer (allocation_size = 16).
//   4. Create and begin a command buffer.
//   5. If the @_device_query_0 global is non-zero: push three buffer bindings,
//      record one dispatch and a barrier, end the command buffer, call
//      hal.ex.submit_and_wait, and return a buffer view over the result buffer.
//      Otherwise: fail with status 2.
vm.func private @simple_mul(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> {
// --- Constants. Names carry a numeric suffix (_0, _1) where the same value is
// needed at a different integer width (e.g. %c16 is i64, %c16_1 is i32).
%c13 = vm.const.i32 13
%c28 = vm.const.i32 28
%zero = vm.const.i64.zero
%c3 = vm.const.i32 3
%c17 = vm.const.i32 17
%c150998019 = vm.const.i32 150998019
%c50 = vm.const.i32 50
%c3075 = vm.const.i32 3075
%c16 = vm.const.i64 16
%c4 = vm.const.i64 4
%c2 = vm.const.i32 2
%zero_0 = vm.const.i32.zero
%c16_1 = vm.const.i32 16
%c1 = vm.const.i32 1
%c553648160 = vm.const.i32 553648160
// --- Module state initialized by @__init.
%_device_query_0 = vm.global.load.i32 @_device_query_0 : i32
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
%_executable_simple_mul_dispatch_0 = vm.global.load.ref @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable>
// --- Validate %arg0: element type code 553648160, encoding type 1, shape [4].
// NOTE(review): 553648160 presumably encodes a 32-bit float element type — confirm.
%_utf8_tensor_3C6209B4FD120BDC = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer
vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_tensor_3C6209B4FD120BDC, %c553648160, %c1, [%c4]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
%ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
// The device (%ref_2) and its allocator (%ref_3) are reused for the rest of the function.
%ref_2 = vm.call @hal.ex.shared_device() {nosideeffects} : () -> !vm.ref<!hal.device>
%ref_3 = vm.call @hal.device.allocator(%ref_2) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
// Assert the backing buffer of %arg0 against allocator %ref_3 with
// minimum length 16, memory types 16 and buffer usage 3075.
// Each assert in this function materializes its own reference to the same
// message rodata (suffixes _4, _5, _7).
%_utf8_tensor_3C6209B4FD120BDC_4 = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer
vm.call @hal.buffer.assert(%ref, %_utf8_tensor_3C6209B4FD120BDC_4, %ref_3, %c16, %c16_1, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
// --- Validate %arg1 with the same operands as %arg0.
%_utf8_tensor_3C6209B4FD120BDC_5 = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer
vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_tensor_3C6209B4FD120BDC_5, %c553648160, %c1, [%c4]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
%ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
%_utf8_tensor_3C6209B4FD120BDC_7 = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer
vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_3C6209B4FD120BDC_7, %ref_3, %c16, %c16_1, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
// --- Allocate the result buffer %ref_8: memory types 50, buffer usage 150998019, size 16.
%ref_8 = vm.call @hal.allocator.allocate(%ref_3, %c50, %c150998019, %c16) : (!vm.ref<!hal.allocator>, i32, i32, i64) -> !vm.ref<!hal.buffer>
// --- Create the command buffer (modes 17, command categories 3) and begin recording.
// Note that create/begin happen before the device-support branch below.
%ref_9 = vm.call @hal.command_buffer.create(%ref_2, %c17, %c3) : (!vm.ref<!hal.device>, i32, i32) -> !vm.ref<!hal.command_buffer>
vm.call @hal.command_buffer.begin(%ref_9) : (!vm.ref<!hal.command_buffer>) -> ()
// Branch on the flag stored by @__init: non-zero -> ^bb1, zero -> ^bb2.
vm.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Three bindings are pushed: (0, %arg0's buffer), (1, %arg1's buffer),
// (2, result buffer), each with the trailing operands (0, 16).
// NOTE(review): the trailing i64 pair is presumably (offset, length) and the
// leading i32 %zero_0 the set index — confirm against the import signature.
vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %_executable_layout_0, %zero_0, [(%zero_0, %ref, %zero, %c16), (%c1, %ref_6, %zero, %c16), (%c2, %ref_8, %zero, %c16)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable_layout>, i32, tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...)
// NOTE(review): operands after the executable are presumably the entry point
// ordinal (0) and a 1x1x1 workgroup count — confirm against the import signature.
vm.call @hal.command_buffer.dispatch(%ref_9, %_executable_simple_mul_dispatch_0, %zero_0, %c1, %c1, %c1) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32) -> ()
// Barrier with source stage mask 28, target stage mask 13, flags 0.
vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero_0) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
vm.call @hal.command_buffer.end(%ref_9) : (!vm.ref<!hal.command_buffer>) -> ()
vm.call @hal.ex.submit_and_wait(%ref_2, %ref_9) : (!vm.ref<!hal.device>, !vm.ref<!hal.command_buffer>) -> ()
// Wrap the result buffer with the same element type, encoding type and shape
// operands that were asserted on the inputs, and return it.
%ref_10 = vm.call.variadic @hal.buffer_view.create(%ref_8, %c553648160, %c1, [%c4]) {nosideeffects} : (!vm.ref<!hal.buffer>, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
vm.return %ref_10 : !vm.ref<!hal.buffer_view>
^bb2: // pred: ^bb0
// Reached when @_device_query_0 is zero; fails with status %c2 (2).
vm.fail %c2, "device not supported in the compiled configuration"
}
// Export @simple_mul from the module.
vm.export @simple_mul
// @__init: populates the three mutable module globals read by @simple_mul:
//   @_device_query_0                    (i32 flag)
//   @_executable_layout_0               (!hal.executable_layout)
//   @_executable_simple_mul_dispatch_0  (!hal.executable, or null)
vm.export @__init
vm.func private @__init() {
// Null references: %null is the fallback executable, %null_0 is passed as the
// constants operand of hal.executable.create.
%null = vm.const.ref.zero : !vm.ref<!hal.executable>
%null_0 = vm.const.ref.zero : !vm.buffer
%c2 = vm.const.i32 2
%c7 = vm.const.i32 7
%zero = vm.const.i32.zero
%c1 = vm.const.i32 1
%ref = vm.call @hal.ex.shared_device() {nosideeffects} : () -> !vm.ref<!hal.device>
// Query the device with two rodata strings as operands.
// NOTE(review): going by the symbol names these hold "hal.executable.format"
// and "embedded-elf-x86_64" — confirm against the rodata definitions.
%_utf8_hal_executable_format_EAB228F999C2D3A1 = vm.const.ref.rodata @_utf8_hal_executable_format_EAB228F999C2D3A1 : !vm.buffer
%_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 : !vm.buffer
%0:2 = vm.call @hal.device.query.i32(%ref, %_utf8_hal_executable_format_EAB228F999C2D3A1, %_utf8_embedded_elf_x86_64_9FD8733DA4A6F228) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i32)
// %2 = (%0#0 != 0) ? (%0#1 & 1) : 0.
// NOTE(review): %0#0 is presumably an "ok" flag and %0#1 the queried value — confirm.
%1 = vm.and.i32 %0#1, %c1 : i32
%2 = vm.select.i32 %0#0, %1, %zero : i32
// Descriptor set layout with three entries (0, 7), (1, 7), (2, 7); the entry
// ordinals match the three bindings pushed in @simple_mul.
%ref_1 = vm.call.variadic @hal.descriptor_set_layout.create(%ref, %c1, [(%zero, %c7), (%c1, %c7), (%c2, %c7)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
// Executable layout with push_constants = 0 and the single set layout above.
%ref_2 = vm.call.variadic @hal.executable_layout.create(%ref, %zero, [%ref_1]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout>
// Both globals are stored unconditionally, before the branch.
vm.global.store.i32 %2, @_device_query_0 : i32
vm.global.store.ref %ref_2, @_executable_layout_0 : !vm.ref<!hal.executable_layout>
// Only create the executable when the query flag is non-zero.
vm.cond_br %2, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Reload the layout that was just stored and create the executable from the
// embedded ELF rodata; the constants operand is the null buffer %null_0.
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
%_utf8_embedded_elf_x86_64_9FD8733DA4A6F228_3 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 : !vm.buffer
%simple_mul_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @simple_mul_dispatch_0_embedded_elf_x86_64 : !vm.buffer
%ref_4 = vm.call.variadic @hal.executable.create(%ref, %_utf8_embedded_elf_x86_64_9FD8733DA4A6F228_3, %simple_mul_dispatch_0_embedded_elf_x86_64, %null_0, [%_executable_layout_0]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable>
vm.br ^bb3(%ref_4 : !vm.ref<!hal.executable>)
^bb2: // pred: ^bb0
// Flag was zero: forward a null executable reference instead.
vm.br ^bb3(%null : !vm.ref<!hal.executable>)
^bb3(%3: !vm.ref<!hal.executable>): // 2 preds: ^bb1, ^bb2
// Join point: %3 is either the created executable or null.
vm.global.store.ref %3, @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable>
vm.return
}
// @__deinit: exported counterpart to @__init. Its body is empty; it returns
// immediately and does not touch any global.
vm.export @__deinit
vm.func private @__deinit() {
vm.return
}
}
}
// -----// IR Dump After CSE //----- //
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}>
module attributes {hal.device.targets = [#device_target_cpu], vm.toplevel} {
vm.module public @module {
vm.global.i32 private mutable @_device_query_0 : i32
vm.global.ref private mutable @_executable_layout_0 : !vm.ref<!hal.executable_layout>
vm.global.ref private mutable @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable>
vm.rodata private @simple_mul_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0xvector<3808xi8>
vm.rodata private @_utf8_hal_executable_format_EAB228F999C2D3A1 {alignment = 1 : i64} dense<[104, 97, 108, 46, 101, 120, 101, 99, 117, 116, 97, 98, 108, 101, 46, 102, 111, 114, 109, 97, 116]> : vector<21xi8>
vm.rodata private @_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 {alignment = 1 : i64} dense<[101, 109, 98, 101, 100, 100, 101, 100, 45, 101, 108, 102, 45, 120, 56, 54, 95, 54, 52]> : vector<19xi8>
vm.import @hal.ex.shared_device() -> !vm.ref<!hal.device> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.ex.submit_and_wait(%device : !vm.ref<!hal.device>, %command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
vm.import @hal.allocator.map.byte_buffer(%allocator : !vm.ref<!hal.allocator>, %try : i32, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
vm.import @hal.allocator.wrap.byte_buffer(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
vm.import @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) attributes {sym_visibility = "private"}
vm.import @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer.length(%buffer : !vm.ref<!hal.buffer>) -> i64 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i32) -> i32 attributes {sym_visibility = "private"}
vm.import @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i32) attributes {sym_visibility = "private"}
vm.import @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) attributes {sym_visibility = "private"}
vm.import @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.byte_length(%buffer_view : !vm.ref<!hal.buffer_view>) -> i64 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.encoding_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref<!hal.buffer_view> ...) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32) -> !vm.ref<!hal.command_buffer> attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.begin(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.end(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>, %label : !vm.buffer) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %pattern : i32, %pattern_length : i32) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executa
View raw

(Sorry about that, but we can’t show files that are this big right now.)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment